file_source.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. // Copyright 2021-2023 EMQ Technologies Co., Ltd.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package file
  15. import (
  16. "bufio"
  17. "encoding/csv"
  18. "encoding/json"
  19. "errors"
  20. "fmt"
  21. "io"
  22. "os"
  23. "path/filepath"
  24. "strconv"
  25. "strings"
  26. "sync"
  27. "time"
  28. "github.com/lf-edge/ekuiper/internal/compressor"
  29. "github.com/lf-edge/ekuiper/internal/conf"
  30. "github.com/lf-edge/ekuiper/internal/xsql"
  31. "github.com/lf-edge/ekuiper/pkg/api"
  32. "github.com/lf-edge/ekuiper/pkg/cast"
  33. )
  34. type FileSourceConfig struct {
  35. FileType FileType `json:"fileType"`
  36. Path string `json:"path"`
  37. Interval int `json:"interval"`
  38. IsTable bool `json:"isTable"`
  39. Parallel bool `json:"parallel"`
  40. SendInterval int `json:"sendInterval"`
  41. ActionAfterRead int `json:"actionAfterRead"`
  42. MoveTo string `json:"moveTo"`
  43. HasHeader bool `json:"hasHeader"`
  44. Columns []string `json:"columns"`
  45. IgnoreStartLines int `json:"ignoreStartLines"`
  46. IgnoreEndLines int `json:"ignoreEndLines"`
  47. Delimiter string `json:"delimiter"`
  48. Decompression string `json:"decompression"`
  49. }
// FileSource The BATCH to load data from file at once
type FileSource struct {
	// file is the absolute path resolved in Configure; a file or a directory.
	file string
	// isDir is true when file points at a directory to be scanned.
	isDir bool
	// config holds the validated source properties.
	config *FileSourceConfig
}
  56. func (fs *FileSource) Close(ctx api.StreamContext) error {
  57. ctx.GetLogger().Infof("Close file source")
  58. // do nothing
  59. return nil
  60. }
  61. func (fs *FileSource) Configure(fileName string, props map[string]interface{}) error {
  62. cfg := &FileSourceConfig{
  63. FileType: JSON_TYPE,
  64. }
  65. err := cast.MapToStruct(props, cfg)
  66. if err != nil {
  67. return fmt.Errorf("read properties %v fail with error: %v", props, err)
  68. }
  69. if cfg.FileType == "" {
  70. return errors.New("missing or invalid property fileType, must be 'json'")
  71. }
  72. if _, ok := fileTypes[cfg.FileType]; !ok {
  73. return fmt.Errorf("invalid property fileType: %s", cfg.FileType)
  74. }
  75. if cfg.Path == "" {
  76. return errors.New("missing property Path")
  77. }
  78. if !filepath.IsAbs(cfg.Path) {
  79. cfg.Path, err = conf.GetLoc(cfg.Path)
  80. if err != nil {
  81. return fmt.Errorf("invalid path %s", cfg.Path)
  82. }
  83. }
  84. if fileName != "/$$TEST_CONNECTION$$" {
  85. fs.file = filepath.Join(cfg.Path, fileName)
  86. fi, err := os.Stat(fs.file)
  87. if err != nil {
  88. if os.IsNotExist(err) {
  89. return fmt.Errorf("file %s not exist", fs.file)
  90. }
  91. }
  92. if fi.IsDir() {
  93. fs.isDir = true
  94. }
  95. }
  96. if cfg.IgnoreStartLines < 0 {
  97. cfg.IgnoreStartLines = 0
  98. }
  99. if cfg.IgnoreEndLines < 0 {
  100. cfg.IgnoreEndLines = 0
  101. }
  102. if cfg.ActionAfterRead < 0 || cfg.ActionAfterRead > 2 {
  103. return fmt.Errorf("invalid actionAfterRead: %d", cfg.ActionAfterRead)
  104. }
  105. if cfg.ActionAfterRead == 2 {
  106. if cfg.MoveTo == "" {
  107. return fmt.Errorf("missing moveTo when actionAfterRead is 2")
  108. } else {
  109. if !filepath.IsAbs(cfg.MoveTo) {
  110. cfg.MoveTo, err = conf.GetLoc(cfg.MoveTo)
  111. if err != nil {
  112. return fmt.Errorf("invalid moveTo %s: %v", cfg.MoveTo, err)
  113. }
  114. }
  115. fileInfo, err := os.Stat(cfg.MoveTo)
  116. if err != nil {
  117. err := os.MkdirAll(cfg.MoveTo, os.ModePerm)
  118. if err != nil {
  119. return fmt.Errorf("fail to create dir for moveTo %s: %v", cfg.MoveTo, err)
  120. }
  121. } else if !fileInfo.IsDir() {
  122. return fmt.Errorf("moveTo %s is not a directory", cfg.MoveTo)
  123. }
  124. }
  125. }
  126. if cfg.Delimiter == "" {
  127. cfg.Delimiter = ","
  128. }
  129. if _, ok := compressionTypes[cfg.Decompression]; !ok && cfg.Decompression != "" {
  130. return fmt.Errorf("decompression must be one of gzip, zstd")
  131. }
  132. fs.config = cfg
  133. return nil
  134. }
  135. func (fs *FileSource) Open(ctx api.StreamContext, consumer chan<- api.SourceTuple, errCh chan<- error) {
  136. err := fs.Load(ctx, consumer)
  137. if err != nil {
  138. select {
  139. case consumer <- &xsql.ErrorSourceTuple{Error: err}:
  140. ctx.GetLogger().Errorf("find error when loading file %s with err %v", fs.file, err)
  141. case <-ctx.Done():
  142. return
  143. }
  144. }
  145. if fs.config.Interval > 0 {
  146. ticker := time.NewTicker(time.Millisecond * time.Duration(fs.config.Interval))
  147. logger := ctx.GetLogger()
  148. defer ticker.Stop()
  149. for {
  150. select {
  151. case <-ticker.C:
  152. logger.Debugf("Load file source again at %v", conf.GetNowInMilli())
  153. err := fs.Load(ctx, consumer)
  154. if err != nil {
  155. errCh <- err
  156. return
  157. }
  158. case <-ctx.Done():
  159. return
  160. }
  161. }
  162. }
  163. }
  164. func (fs *FileSource) Load(ctx api.StreamContext, consumer chan<- api.SourceTuple) error {
  165. rcvTime := conf.GetNow()
  166. if fs.isDir {
  167. ctx.GetLogger().Debugf("Monitor dir %s", fs.file)
  168. entries, err := os.ReadDir(fs.file)
  169. if err != nil {
  170. return err
  171. }
  172. if fs.config.Parallel {
  173. var wg sync.WaitGroup
  174. for _, entry := range entries {
  175. if entry.IsDir() {
  176. continue
  177. }
  178. wg.Add(1)
  179. go func(file string) {
  180. defer wg.Done()
  181. err := fs.parseFile(ctx, file, consumer)
  182. if err != nil {
  183. ctx.GetLogger().Errorf("Failed to parse file %s: %v", file, err)
  184. }
  185. }(filepath.Join(fs.file, entry.Name()))
  186. }
  187. wg.Wait()
  188. } else {
  189. for _, entry := range entries {
  190. if entry.IsDir() {
  191. continue
  192. }
  193. file := filepath.Join(fs.file, entry.Name())
  194. err := fs.parseFile(ctx, file, consumer)
  195. if err != nil {
  196. ctx.GetLogger().Errorf("parse file %s fail with error: %v", file, err)
  197. continue
  198. }
  199. }
  200. }
  201. } else {
  202. err := fs.parseFile(ctx, fs.file, consumer)
  203. if err != nil {
  204. return err
  205. }
  206. }
  207. // Send EOF if retain size not set if used in table
  208. if fs.config.IsTable {
  209. select {
  210. case consumer <- api.NewDefaultSourceTupleWithTime(nil, nil, rcvTime):
  211. // do nothing
  212. case <-ctx.Done():
  213. return nil
  214. }
  215. }
  216. ctx.GetLogger().Debug("All tuples sent")
  217. return nil
  218. }
  219. func (fs *FileSource) parseFile(ctx api.StreamContext, file string, consumer chan<- api.SourceTuple) (result error) {
  220. r, err := fs.prepareFile(ctx, file)
  221. if err != nil {
  222. ctx.GetLogger().Debugf("prepare file %s error: %v", file, err)
  223. return err
  224. }
  225. meta := map[string]interface{}{
  226. "file": file,
  227. }
  228. defer func() {
  229. ctx.GetLogger().Debugf("Finish loading from file %s", file)
  230. if closer, ok := r.(io.Closer); ok {
  231. ctx.GetLogger().Debugf("Close reader")
  232. closer.Close()
  233. }
  234. if result == nil {
  235. switch fs.config.ActionAfterRead {
  236. case 1:
  237. if err := os.Remove(file); err != nil {
  238. result = err
  239. }
  240. ctx.GetLogger().Debugf("Remove file %s", file)
  241. case 2:
  242. targetFile := filepath.Join(fs.config.MoveTo, filepath.Base(file))
  243. if err := os.Rename(file, targetFile); err != nil {
  244. result = err
  245. }
  246. ctx.GetLogger().Debugf("Move file %s to %s", file, targetFile)
  247. }
  248. }
  249. }()
  250. return fs.publish(ctx, r, consumer, meta)
  251. }
  252. func (fs *FileSource) publish(ctx api.StreamContext, file io.Reader, consumer chan<- api.SourceTuple, meta map[string]interface{}) error {
  253. ctx.GetLogger().Debug("Start to load")
  254. rcvTime := conf.GetNow()
  255. switch fs.config.FileType {
  256. case JSON_TYPE:
  257. r := json.NewDecoder(file)
  258. resultMap := make([]map[string]interface{}, 0)
  259. err := r.Decode(&resultMap)
  260. if err != nil {
  261. return fmt.Errorf("loaded %s, check error %s", fs.file, err)
  262. }
  263. ctx.GetLogger().Debug("Sending tuples")
  264. for _, m := range resultMap {
  265. select {
  266. case consumer <- api.NewDefaultSourceTupleWithTime(m, meta, rcvTime):
  267. case <-ctx.Done():
  268. return nil
  269. }
  270. if fs.config.SendInterval > 0 {
  271. time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
  272. }
  273. rcvTime = conf.GetNow()
  274. }
  275. return nil
  276. case CSV_TYPE:
  277. r := csv.NewReader(file)
  278. r.Comma = rune(fs.config.Delimiter[0])
  279. r.TrimLeadingSpace = true
  280. r.FieldsPerRecord = -1
  281. cols := fs.config.Columns
  282. if fs.config.HasHeader {
  283. var err error
  284. ctx.GetLogger().Debug("Has header")
  285. cols, err = r.Read()
  286. if err == io.EOF {
  287. break
  288. }
  289. if err != nil {
  290. ctx.GetLogger().Warnf("Read file %s encounter error: %v", fs.file, err)
  291. return err
  292. }
  293. ctx.GetLogger().Debugf("Got header %v", cols)
  294. }
  295. for {
  296. record, err := r.Read()
  297. if err == io.EOF {
  298. break
  299. }
  300. if err != nil {
  301. ctx.GetLogger().Warnf("Read file %s encounter error: %v", fs.file, err)
  302. continue
  303. }
  304. ctx.GetLogger().Debugf("Read" + strings.Join(record, ","))
  305. var m map[string]interface{}
  306. if cols == nil {
  307. m = make(map[string]interface{}, len(record))
  308. for i, v := range record {
  309. m["cols"+strconv.Itoa(i)] = v
  310. }
  311. } else {
  312. m = make(map[string]interface{}, len(cols))
  313. for i, v := range cols {
  314. m[v] = record[i]
  315. }
  316. }
  317. select {
  318. case consumer <- api.NewDefaultSourceTupleWithTime(m, meta, rcvTime):
  319. case <-ctx.Done():
  320. return nil
  321. }
  322. if fs.config.SendInterval > 0 {
  323. time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
  324. }
  325. rcvTime = conf.GetNow()
  326. }
  327. case LINES_TYPE:
  328. scanner := bufio.NewScanner(file)
  329. scanner.Split(bufio.ScanLines)
  330. for scanner.Scan() {
  331. var tuples []api.SourceTuple
  332. m, err := ctx.DecodeIntoList(scanner.Bytes())
  333. if err != nil {
  334. tuples = []api.SourceTuple{&xsql.ErrorSourceTuple{
  335. Error: fmt.Errorf("Invalid data format, cannot decode %s with error %s", scanner.Text(), err),
  336. }}
  337. } else {
  338. for _, t := range m {
  339. tuples = append(tuples, api.NewDefaultSourceTupleWithTime(t, meta, rcvTime))
  340. }
  341. }
  342. for _, tuple := range tuples {
  343. select {
  344. case consumer <- tuple:
  345. case <-ctx.Done():
  346. return nil
  347. }
  348. }
  349. if fs.config.SendInterval > 0 {
  350. time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
  351. }
  352. rcvTime = conf.GetNow()
  353. }
  354. default:
  355. return fmt.Errorf("invalid file type %s", fs.config.FileType)
  356. }
  357. return nil
  358. }
  359. // prepareFile prepare file by deleting ignore lines
  360. func (fs *FileSource) prepareFile(ctx api.StreamContext, file string) (io.Reader, error) {
  361. f, err := os.Open(file)
  362. if err != nil {
  363. ctx.GetLogger().Error(err)
  364. return nil, err
  365. }
  366. var reader io.ReadCloser
  367. if fs.config.Decompression != "" {
  368. reader, err = compressor.GetDecompressReader(fs.config.Decompression, f)
  369. if err != nil {
  370. return nil, err
  371. }
  372. } else {
  373. reader = f
  374. }
  375. if fs.config.IgnoreStartLines > 0 || fs.config.IgnoreEndLines > 0 {
  376. r, w := io.Pipe()
  377. go func() {
  378. defer func() {
  379. ctx.GetLogger().Debugf("Close pipe files %s", file)
  380. w.Close()
  381. reader.Close()
  382. }()
  383. scanner := bufio.NewScanner(reader)
  384. scanner.Split(bufio.ScanLines)
  385. ln := 0
  386. // This is a queue to store the lines that should be ignored
  387. tempLines := make([][]byte, 0, fs.config.IgnoreEndLines)
  388. for scanner.Scan() {
  389. if ln >= fs.config.IgnoreStartLines {
  390. if fs.config.IgnoreEndLines > 0 { // the last n line are left in the tempLines
  391. slot := (ln - fs.config.IgnoreStartLines) % fs.config.IgnoreEndLines
  392. if len(tempLines) <= slot { // first round
  393. tempLines = append(tempLines, scanner.Bytes())
  394. } else {
  395. _, err := w.Write(tempLines[slot])
  396. if err != nil {
  397. ctx.GetLogger().Error(err)
  398. break
  399. }
  400. _, err = w.Write([]byte{'\n'})
  401. if err != nil {
  402. ctx.GetLogger().Error(err)
  403. break
  404. }
  405. tempLines[slot] = scanner.Bytes()
  406. }
  407. } else {
  408. _, err = w.Write(scanner.Bytes())
  409. if err != nil {
  410. ctx.GetLogger().Error(err)
  411. break
  412. }
  413. _, err = w.Write([]byte{'\n'})
  414. if err != nil {
  415. ctx.GetLogger().Error(err)
  416. break
  417. }
  418. }
  419. }
  420. ln++
  421. }
  422. }()
  423. return r, nil
  424. }
  425. return reader, nil
  426. }