// Copyright 2021-2023 EMQ Technologies Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file

import (
	"bufio"
	"encoding/csv"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/klauspost/compress/gzip"
	"github.com/klauspost/compress/zstd"

	"github.com/lf-edge/ekuiper/internal/conf"
	"github.com/lf-edge/ekuiper/internal/xsql"
	"github.com/lf-edge/ekuiper/pkg/api"
	"github.com/lf-edge/ekuiper/pkg/cast"
)

type FileSourceConfig struct {
	FileType         FileType `json:"fileType"`
	Path             string   `json:"path"`
	Interval         int      `json:"interval"`
	IsTable          bool     `json:"isTable"`
	SendInterval     int      `json:"sendInterval"`
	ActionAfterRead  int      `json:"actionAfterRead"`
	MoveTo           string   `json:"moveTo"`
	HasHeader        bool     `json:"hasHeader"`
	Columns          []string `json:"columns"`
	IgnoreStartLines int      `json:"ignoreStartLines"`
	IgnoreEndLines   int      `json:"ignoreEndLines"`
	Delimiter        string   `json:"delimiter"`
	Decompression    string   `json:"decompression"`
}
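
// A minimal illustrative props map, as Configure would receive it. Keys follow
// the json tags above; the values are hypothetical examples, not defaults:
//
//	props := map[string]interface{}{
//		"fileType":         "csv",
//		"path":             "data",      // relative paths resolve via conf.GetLoc
//		"interval":         0,           // in ms; 0 loads the file only once
//		"hasHeader":        true,
//		"ignoreStartLines": 1,
//		"actionAfterRead":  2,           // 0 keep, 1 delete, 2 move to moveTo
//		"moveTo":           "data/done",
//	}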

// FileSource is a batch source: it loads all data from a file at once.
type FileSource struct {
	file   string
	isDir  bool
	config *FileSourceConfig
}

func (fs *FileSource) Close(ctx api.StreamContext) error {
	ctx.GetLogger().Infof("Close file source")
	// do nothing
	return nil
}

func (fs *FileSource) Configure(fileName string, props map[string]interface{}) error {
	cfg := &FileSourceConfig{
		FileType: JSON_TYPE,
	}
	err := cast.MapToStruct(props, cfg)
	if err != nil {
		return fmt.Errorf("read properties %v fail with error: %v", props, err)
	}
	if cfg.FileType == "" {
		return errors.New("missing or invalid property fileType")
	}
	if _, ok := fileTypes[cfg.FileType]; !ok {
		return fmt.Errorf("invalid property fileType: %s", cfg.FileType)
	}
	if cfg.Path == "" {
		return errors.New("missing property Path")
	}
	if !filepath.IsAbs(cfg.Path) {
		cfg.Path, err = conf.GetLoc(cfg.Path)
		if err != nil {
			return fmt.Errorf("invalid path %s", cfg.Path)
		}
	}
	if fileName != "/$$TEST_CONNECTION$$" {
		fs.file = filepath.Join(cfg.Path, fileName)
		fi, err := os.Stat(fs.file)
		if err != nil {
			if os.IsNotExist(err) {
				return fmt.Errorf("file %s not exist", fs.file)
			}
			// Return other stat errors too; falling through here would
			// dereference a nil FileInfo below.
			return fmt.Errorf("stat file %s fail with error: %v", fs.file, err)
		}
		if fi.IsDir() {
			fs.isDir = true
		}
	}
	if cfg.IgnoreStartLines < 0 {
		cfg.IgnoreStartLines = 0
	}
	if cfg.IgnoreEndLines < 0 {
		cfg.IgnoreEndLines = 0
	}
	if cfg.ActionAfterRead < 0 || cfg.ActionAfterRead > 2 {
		return fmt.Errorf("invalid actionAfterRead: %d", cfg.ActionAfterRead)
	}
	if cfg.ActionAfterRead == 2 {
		if cfg.MoveTo == "" {
			return fmt.Errorf("missing moveTo when actionAfterRead is 2")
		} else {
			if !filepath.IsAbs(cfg.MoveTo) {
				cfg.MoveTo, err = conf.GetLoc(cfg.MoveTo)
				if err != nil {
					return fmt.Errorf("invalid moveTo %s: %v", cfg.MoveTo, err)
				}
			}
			fileInfo, err := os.Stat(cfg.MoveTo)
			if err != nil {
				err := os.MkdirAll(cfg.MoveTo, os.ModePerm)
				if err != nil {
					return fmt.Errorf("fail to create dir for moveTo %s: %v", cfg.MoveTo, err)
				}
			} else if !fileInfo.IsDir() {
				return fmt.Errorf("moveTo %s is not a directory", cfg.MoveTo)
			}
		}
	}
	if cfg.Delimiter == "" {
		cfg.Delimiter = ","
	}
	if _, ok := compressionTypes[cfg.Decompression]; !ok && cfg.Decompression != "" {
		return fmt.Errorf("decompression must be one of gzip, zstd")
	}
	fs.config = cfg
	return nil
}
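
// Hypothetical usage sketch (not part of this file): for a stream created as,
// say,
//
//	CREATE STREAM demo () WITH (DATASOURCE="test.json", FORMAT="json", TYPE="file")
//
// the runtime would call Configure("test.json", props) with the props from the
// file source configuration, then Open to start emitting tuples.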

// Open loads the file once and, when interval is configured, reloads it on
// every tick until the context is done.
func (fs *FileSource) Open(ctx api.StreamContext, consumer chan<- api.SourceTuple, errCh chan<- error) {
	err := fs.Load(ctx, consumer)
	if err != nil {
		select {
		case consumer <- &xsql.ErrorSourceTuple{Error: err}:
			ctx.GetLogger().Errorf("find error when loading file %s with err %v", fs.file, err)
		case <-ctx.Done():
			return
		}
	}
	if fs.config.Interval > 0 {
		ticker := time.NewTicker(time.Millisecond * time.Duration(fs.config.Interval))
		logger := ctx.GetLogger()
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				logger.Debugf("Load file source again at %v", conf.GetNowInMilli())
				err := fs.Load(ctx, consumer)
				if err != nil {
					errCh <- err
					return
				}
			case <-ctx.Done():
				return
			}
		}
	}
}

// Load reads the configured file, or every file in the configured directory,
// and sends the parsed tuples to the consumer.
func (fs *FileSource) Load(ctx api.StreamContext, consumer chan<- api.SourceTuple) error {
	rcvTime := conf.GetNow()
	if fs.isDir {
		ctx.GetLogger().Debugf("Monitor dir %s", fs.file)
		entries, err := os.ReadDir(fs.file)
		if err != nil {
			return err
		}
		for _, entry := range entries {
			if entry.IsDir() {
				continue
			}
			file := filepath.Join(fs.file, entry.Name())
			err := fs.parseFile(ctx, file, consumer)
			if err != nil {
				ctx.GetLogger().Errorf("parse file %s fail with error: %v", file, err)
				continue
			}
		}
	} else {
		err := fs.parseFile(ctx, fs.file, consumer)
		if err != nil {
			return err
		}
	}
	// Send a nil tuple as the EOF marker when used in a table (and no retain
	// size is set), so the table knows the batch is complete.
	if fs.config.IsTable {
		select {
		case consumer <- api.NewDefaultSourceTupleWithTime(nil, nil, rcvTime):
			// do nothing
		case <-ctx.Done():
			return nil
		}
	}
	ctx.GetLogger().Debug("All tuples sent")
	return nil
}

func (fs *FileSource) parseFile(ctx api.StreamContext, file string, consumer chan<- api.SourceTuple) (result error) {
	r, err := fs.prepareFile(ctx, file)
	if err != nil {
		ctx.GetLogger().Debugf("prepare file %s error: %v", file, err)
		return err
	}
	meta := map[string]interface{}{
		"file": file,
	}
	defer func() {
		ctx.GetLogger().Debugf("Finish loading from file %s", file)
		if closer, ok := r.(io.Closer); ok {
			ctx.GetLogger().Debugf("Close reader")
			closer.Close()
		}
		// Only clean up the file after it has been parsed successfully.
		if result == nil {
			switch fs.config.ActionAfterRead {
			case 1:
				if err := os.Remove(file); err != nil {
					result = err
				}
				ctx.GetLogger().Debugf("Remove file %s", file)
			case 2:
				targetFile := filepath.Join(fs.config.MoveTo, filepath.Base(file))
				if err := os.Rename(file, targetFile); err != nil {
					result = err
				}
				ctx.GetLogger().Debugf("Move file %s to %s", file, targetFile)
			}
		}
	}()
	return fs.publish(ctx, r, consumer, meta)
}
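
// For example (hypothetical paths): with actionAfterRead=2 and
// moveTo="/tmp/done", a successfully parsed "/data/a.csv" is renamed to
// "/tmp/done/a.csv"; when parsing fails, the file stays where it is.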

func (fs *FileSource) publish(ctx api.StreamContext, file io.Reader, consumer chan<- api.SourceTuple, meta map[string]interface{}) error {
	ctx.GetLogger().Debug("Start to load")
	rcvTime := conf.GetNow()
	switch fs.config.FileType {
	case JSON_TYPE:
		r := json.NewDecoder(file)
		resultMap := make([]map[string]interface{}, 0)
		err := r.Decode(&resultMap)
		if err != nil {
			return fmt.Errorf("load %s fail with decode error: %s", fs.file, err)
		}
		ctx.GetLogger().Debug("Sending tuples")
		for _, m := range resultMap {
			select {
			case consumer <- api.NewDefaultSourceTupleWithTime(m, meta, rcvTime):
			case <-ctx.Done():
				return nil
			}
			if fs.config.SendInterval > 0 {
				time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
			}
		}
		return nil
	case CSV_TYPE:
		r := csv.NewReader(file)
		r.Comma = rune(fs.config.Delimiter[0])
		r.TrimLeadingSpace = true
		r.FieldsPerRecord = -1
		cols := fs.config.Columns
		if fs.config.HasHeader {
			var err error
			ctx.GetLogger().Debug("Has header")
			cols, err = r.Read()
			if err == io.EOF {
				// Empty file: the break exits the switch and publish returns nil.
				break
			}
			if err != nil {
				ctx.GetLogger().Warnf("Read file %s encounter error: %v", fs.file, err)
				return err
			}
			ctx.GetLogger().Debugf("Got header %v", cols)
		}
		for {
			record, err := r.Read()
			if err == io.EOF {
				break
			}
			if err != nil {
				ctx.GetLogger().Warnf("Read file %s encounter error: %v", fs.file, err)
				continue
			}
			ctx.GetLogger().Debugf("Read %s", strings.Join(record, ","))
			var m map[string]interface{}
			if cols == nil {
				m = make(map[string]interface{}, len(record))
				for i, v := range record {
					m["cols"+strconv.Itoa(i)] = v
				}
			} else {
				m = make(map[string]interface{}, len(cols))
				for i, v := range cols {
					// Guard against records shorter than the header/columns.
					if i < len(record) {
						m[v] = record[i]
					}
				}
			}
			select {
			case consumer <- api.NewDefaultSourceTupleWithTime(m, meta, rcvTime):
			case <-ctx.Done():
				return nil
			}
			if fs.config.SendInterval > 0 {
				time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
			}
		}
	case LINES_TYPE:
		scanner := bufio.NewScanner(file)
		scanner.Split(bufio.ScanLines)
		for scanner.Scan() {
			var tuples []api.SourceTuple
			m, err := ctx.DecodeIntoList(scanner.Bytes())
			if err != nil {
				tuples = []api.SourceTuple{&xsql.ErrorSourceTuple{
					Error: fmt.Errorf("invalid data format, cannot decode %s with error %s", scanner.Text(), err),
				}}
			} else {
				for _, t := range m {
					tuples = append(tuples, api.NewDefaultSourceTupleWithTime(t, meta, rcvTime))
				}
			}
			for _, tuple := range tuples {
				select {
				case consumer <- tuple:
				case <-ctx.Done():
					return nil
				}
			}
			if fs.config.SendInterval > 0 {
				time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
			}
		}
	default:
		return fmt.Errorf("invalid file type %s", fs.config.FileType)
	}
	return nil
}
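
// CSV example (illustrative): with fileType="csv", hasHeader=true and the
// default "," delimiter, a file containing
//
//	temp,hum
//	25.5,65
//
// yields one tuple map[temp:25.5 hum:65]; field values stay strings at this
// point. Without a header and without configured columns, keys fall back to
// cols0, cols1, ...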

// prepareFile opens the file, wraps it with the configured decompression and
// returns a reader that skips the ignored leading/trailing lines.
func (fs *FileSource) prepareFile(ctx api.StreamContext, file string) (io.Reader, error) {
	f, err := os.Open(file)
	if err != nil {
		ctx.GetLogger().Error(err)
		return nil, err
	}
	var reader io.ReadCloser
	switch fs.config.Decompression {
	case GZIP:
		newReader, err := gzip.NewReader(f)
		if err != nil {
			return nil, err
		}
		reader = newReader
	case ZSTD:
		newReader, err := zstd.NewReader(f)
		if err != nil {
			return nil, err
		}
		reader = newReader.IOReadCloser()
	default:
		reader = f
	}
	if fs.config.IgnoreStartLines > 0 || fs.config.IgnoreEndLines > 0 {
		r, w := io.Pipe()
		go func() {
			defer func() {
				ctx.GetLogger().Debugf("Close pipe files %s", file)
				w.Close()
				reader.Close()
			}()
			scanner := bufio.NewScanner(reader)
			scanner.Split(bufio.ScanLines)
			ln := 0
			// A ring buffer holding the most recent ignoreEndLines lines;
			// whatever is still in it when the scan ends is dropped.
			tempLines := make([]string, 0, fs.config.IgnoreEndLines)
			for scanner.Scan() {
				if ln >= fs.config.IgnoreStartLines {
					if fs.config.IgnoreEndLines > 0 { // the last n lines are held back in tempLines
						slot := (ln - fs.config.IgnoreStartLines) % fs.config.IgnoreEndLines
						if len(tempLines) <= slot { // first round: just fill the buffer
							tempLines = append(tempLines, scanner.Text())
						} else {
							_, err := w.Write([]byte(tempLines[slot]))
							if err != nil {
								ctx.GetLogger().Error(err)
								break
							}
							_, err = w.Write([]byte{'\n'})
							if err != nil {
								ctx.GetLogger().Error(err)
								break
							}
							tempLines[slot] = scanner.Text()
						}
					} else {
						_, err := w.Write(scanner.Bytes())
						if err != nil {
							ctx.GetLogger().Error(err)
							break
						}
						_, err = w.Write([]byte{'\n'})
						if err != nil {
							ctx.GetLogger().Error(err)
							break
						}
					}
				}
				ln++
			}
		}()
		return r, nil
	}
	return reader, nil
}
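
// Worked example (illustrative): with ignoreStartLines=1 and ignoreEndLines=2,
// a six-line file L1..L6 flows as follows. L1 is skipped outright. L2 and L3
// fill the two ring-buffer slots. Each later line then evicts and writes the
// line two positions before it, so L2, L3 and L4 reach the returned reader,
// while L5 and L6 are still buffered when the scan ends and are silently dropped.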