file_source.go

// Copyright 2021-2023 EMQ Technologies Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file

import (
	"bufio"
	"encoding/csv"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/lf-edge/ekuiper/internal/compressor"
	"github.com/lf-edge/ekuiper/internal/conf"
	"github.com/lf-edge/ekuiper/internal/xsql"
	"github.com/lf-edge/ekuiper/pkg/api"
	"github.com/lf-edge/ekuiper/pkg/cast"
)

type FileSourceConfig struct {
	FileType         FileType `json:"fileType"`
	Path             string   `json:"path"`
	Interval         int      `json:"interval"`
	IsTable          bool     `json:"isTable"`
	SendInterval     int      `json:"sendInterval"`
	ActionAfterRead  int      `json:"actionAfterRead"`
	MoveTo           string   `json:"moveTo"`
	HasHeader        bool     `json:"hasHeader"`
	Columns          []string `json:"columns"`
	IgnoreStartLines int      `json:"ignoreStartLines"`
	IgnoreEndLines   int      `json:"ignoreEndLines"`
	Delimiter        string   `json:"delimiter"`
	Decompression    string   `json:"decompression"`
}

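// An illustrative properties map for this source, keyed by the JSON tags
// above. The values here are hypothetical and shown only as an example of a
// CSV configuration that moves each file away after reading:
//
//	{
//	  "fileType": "csv",
//	  "path": "data",
//	  "interval": 0,
//	  "sendInterval": 0,
//	  "actionAfterRead": 2,
//	  "moveTo": "data/done",
//	  "hasHeader": true,
//	  "delimiter": ",",
//	  "decompression": "gzip"
//	}
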
// FileSource is a batch source: it loads all data from a file at once.
type FileSource struct {
	file   string
	isDir  bool
	config *FileSourceConfig
}

func (fs *FileSource) Close(ctx api.StreamContext) error {
	ctx.GetLogger().Infof("Close file source")
	// do nothing
	return nil
}

func (fs *FileSource) Configure(fileName string, props map[string]interface{}) error {
	cfg := &FileSourceConfig{
		FileType: JSON_TYPE,
	}
	err := cast.MapToStruct(props, cfg)
	if err != nil {
		return fmt.Errorf("read properties %v fail with error: %v", props, err)
	}
	if cfg.FileType == "" {
		return errors.New("missing property fileType")
	}
	if _, ok := fileTypes[cfg.FileType]; !ok {
		return fmt.Errorf("invalid property fileType: %s", cfg.FileType)
	}
	if cfg.Path == "" {
		return errors.New("missing property Path")
	}
	if !filepath.IsAbs(cfg.Path) {
		cfg.Path, err = conf.GetLoc(cfg.Path)
		if err != nil {
			return fmt.Errorf("invalid path %s", cfg.Path)
		}
	}
	if fileName != "/$$TEST_CONNECTION$$" {
		fs.file = filepath.Join(cfg.Path, fileName)
		fi, err := os.Stat(fs.file)
		if err != nil {
			if os.IsNotExist(err) {
				return fmt.Errorf("file %s not exist", fs.file)
			}
			// Return other stat errors too; otherwise fi would be nil and
			// the IsDir call below would panic.
			return fmt.Errorf("stat file %s error: %v", fs.file, err)
		}
		if fi.IsDir() {
			fs.isDir = true
		}
	}
	if cfg.IgnoreStartLines < 0 {
		cfg.IgnoreStartLines = 0
	}
	if cfg.IgnoreEndLines < 0 {
		cfg.IgnoreEndLines = 0
	}
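	// actionAfterRead semantics, as implemented in the parseFile defer below:
	//   0 - keep the file untouched after reading
	//   1 - delete the file after a successful read
	//   2 - move the file into the moveTo directory after a successful read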
	if cfg.ActionAfterRead < 0 || cfg.ActionAfterRead > 2 {
		return fmt.Errorf("invalid actionAfterRead: %d", cfg.ActionAfterRead)
	}
	if cfg.ActionAfterRead == 2 {
		if cfg.MoveTo == "" {
			return fmt.Errorf("missing moveTo when actionAfterRead is 2")
		} else {
			if !filepath.IsAbs(cfg.MoveTo) {
				cfg.MoveTo, err = conf.GetLoc(cfg.MoveTo)
				if err != nil {
					return fmt.Errorf("invalid moveTo %s: %v", cfg.MoveTo, err)
				}
			}
			fileInfo, err := os.Stat(cfg.MoveTo)
			if err != nil {
				err := os.MkdirAll(cfg.MoveTo, os.ModePerm)
				if err != nil {
					return fmt.Errorf("fail to create dir for moveTo %s: %v", cfg.MoveTo, err)
				}
			} else if !fileInfo.IsDir() {
				return fmt.Errorf("moveTo %s is not a directory", cfg.MoveTo)
			}
		}
	}
	if cfg.Delimiter == "" {
		cfg.Delimiter = ","
	}
	if _, ok := compressionTypes[cfg.Decompression]; !ok && cfg.Decompression != "" {
		return fmt.Errorf("decompression must be one of gzip, zstd")
	}
	fs.config = cfg
	return nil
}

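// Open loads the file once and then, when Interval is greater than zero,
// reloads it every Interval milliseconds until the context is done.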
func (fs *FileSource) Open(ctx api.StreamContext, consumer chan<- api.SourceTuple, errCh chan<- error) {
	err := fs.Load(ctx, consumer)
	if err != nil {
		select {
		case consumer <- &xsql.ErrorSourceTuple{Error: err}:
			ctx.GetLogger().Errorf("error loading file %s: %v", fs.file, err)
		case <-ctx.Done():
			return
		}
	}
	if fs.config.Interval > 0 {
		ticker := time.NewTicker(time.Millisecond * time.Duration(fs.config.Interval))
		logger := ctx.GetLogger()
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				logger.Debugf("Load file source again at %v", conf.GetNowInMilli())
				err := fs.Load(ctx, consumer)
				if err != nil {
					errCh <- err
					return
				}
			case <-ctx.Done():
				return
			}
		}
	}
}

func (fs *FileSource) Load(ctx api.StreamContext, consumer chan<- api.SourceTuple) error {
	rcvTime := conf.GetNow()
	if fs.isDir {
		ctx.GetLogger().Debugf("Monitor dir %s", fs.file)
		entries, err := os.ReadDir(fs.file)
		if err != nil {
			return err
		}
		for _, entry := range entries {
			if entry.IsDir() {
				continue
			}
			file := filepath.Join(fs.file, entry.Name())
			err := fs.parseFile(ctx, file, consumer)
			if err != nil {
				ctx.GetLogger().Errorf("parse file %s fail with error: %v", file, err)
				continue
			}
		}
	} else {
		err := fs.parseFile(ctx, fs.file, consumer)
		if err != nil {
			return err
		}
	}
	// When used as a table (and no retain size is set), send a tuple with a
	// nil message as an EOF marker so downstream knows the batch load finished.
	if fs.config.IsTable {
		select {
		case consumer <- api.NewDefaultSourceTupleWithTime(nil, nil, rcvTime):
			// do nothing
		case <-ctx.Done():
			return nil
		}
	}
	ctx.GetLogger().Debug("All tuples sent")
	return nil
}

func (fs *FileSource) parseFile(ctx api.StreamContext, file string, consumer chan<- api.SourceTuple) (result error) {
	r, err := fs.prepareFile(ctx, file)
	if err != nil {
		ctx.GetLogger().Debugf("prepare file %s error: %v", file, err)
		return err
	}
	meta := map[string]interface{}{
		"file": file,
	}
	defer func() {
		ctx.GetLogger().Debugf("Finish loading from file %s", file)
		if closer, ok := r.(io.Closer); ok {
			ctx.GetLogger().Debugf("Close reader")
			closer.Close()
		}
		if result == nil {
			switch fs.config.ActionAfterRead {
			case 1:
				if err := os.Remove(file); err != nil {
					result = err
				}
				ctx.GetLogger().Debugf("Remove file %s", file)
			case 2:
				targetFile := filepath.Join(fs.config.MoveTo, filepath.Base(file))
				if err := os.Rename(file, targetFile); err != nil {
					result = err
				}
				ctx.GetLogger().Debugf("Move file %s to %s", file, targetFile)
			}
		}
	}()
	return fs.publish(ctx, r, consumer, meta)
}

func (fs *FileSource) publish(ctx api.StreamContext, file io.Reader, consumer chan<- api.SourceTuple, meta map[string]interface{}) error {
	ctx.GetLogger().Debug("Start to load")
	rcvTime := conf.GetNow()
	switch fs.config.FileType {
	case JSON_TYPE:
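		// The whole file is decoded as a single JSON array of objects and
		// each object becomes one tuple; e.g. a file containing
		// [{"id": 1}, {"id": 2}] yields two tuples.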
		r := json.NewDecoder(file)
		resultMap := make([]map[string]interface{}, 0)
		err := r.Decode(&resultMap)
		if err != nil {
			return fmt.Errorf("decode file %s error: %v", fs.file, err)
		}
		ctx.GetLogger().Debug("Sending tuples")
		for _, m := range resultMap {
			select {
			case consumer <- api.NewDefaultSourceTupleWithTime(m, meta, rcvTime):
			case <-ctx.Done():
				return nil
			}
			if fs.config.SendInterval > 0 {
				time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
			}
		}
		return nil
	case CSV_TYPE:
		r := csv.NewReader(file)
		// Only the first byte of the configured delimiter is used, so the
		// delimiter must be a single ASCII character.
		r.Comma = rune(fs.config.Delimiter[0])
		r.TrimLeadingSpace = true
		r.FieldsPerRecord = -1
		cols := fs.config.Columns
		if fs.config.HasHeader {
			var err error
			ctx.GetLogger().Debug("Has header")
			cols, err = r.Read()
			if err == io.EOF {
				break
			}
			if err != nil {
				ctx.GetLogger().Warnf("Read file %s encounter error: %v", fs.file, err)
				return err
			}
			ctx.GetLogger().Debugf("Got header %v", cols)
		}
		for {
			record, err := r.Read()
			if err == io.EOF {
				break
			}
			if err != nil {
				ctx.GetLogger().Warnf("Read file %s encounter error: %v", fs.file, err)
				continue
			}
			ctx.GetLogger().Debugf("Read %s", strings.Join(record, ","))
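			// When no header row and no columns property are available, the
			// fields are auto-named cols0, cols1, ...; e.g. the record
			// ["3","4"] becomes {"cols0": "3", "cols1": "4"}.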
			var m map[string]interface{}
			if cols == nil {
				m = make(map[string]interface{}, len(record))
				for i, v := range record {
					m["cols"+strconv.Itoa(i)] = v
				}
			} else {
				m = make(map[string]interface{}, len(cols))
				for i, v := range cols {
					// Guard against records shorter than the header, which
					// FieldsPerRecord = -1 allows, to avoid an index panic.
					if i < len(record) {
						m[v] = record[i]
					}
				}
			}
			select {
			case consumer <- api.NewDefaultSourceTupleWithTime(m, meta, rcvTime):
			case <-ctx.Done():
				return nil
			}
			if fs.config.SendInterval > 0 {
				time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
			}
		}
	case LINES_TYPE:
		scanner := bufio.NewScanner(file)
		scanner.Split(bufio.ScanLines)
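		// Each line is decoded with the stream's configured format decoder
		// (ctx.DecodeIntoList), so a single line may yield multiple tuples.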
		for scanner.Scan() {
			var tuples []api.SourceTuple
			m, err := ctx.DecodeIntoList(scanner.Bytes())
			if err != nil {
				tuples = []api.SourceTuple{&xsql.ErrorSourceTuple{
					Error: fmt.Errorf("invalid data format, cannot decode %s with error %s", scanner.Text(), err),
				}}
			} else {
				for _, t := range m {
					tuples = append(tuples, api.NewDefaultSourceTupleWithTime(t, meta, rcvTime))
				}
			}
			for _, tuple := range tuples {
				select {
				case consumer <- tuple:
				case <-ctx.Done():
					return nil
				}
			}
			if fs.config.SendInterval > 0 {
				time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
			}
		}
	default:
		return fmt.Errorf("invalid file type %s", fs.config.FileType)
	}
	return nil
}

// prepareFile opens the file, optionally wraps it in a decompression reader,
// and strips the lines configured to be ignored at the start and end.
func (fs *FileSource) prepareFile(ctx api.StreamContext, file string) (io.Reader, error) {
	f, err := os.Open(file)
	if err != nil {
		ctx.GetLogger().Error(err)
		return nil, err
	}
	var reader io.ReadCloser
	if fs.config.Decompression != "" {
		reader, err = compressor.GetDecompressReader(fs.config.Decompression, f)
		if err != nil {
			return nil, err
		}
	} else {
		reader = f
	}
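	// The filtered content is streamed through an io.Pipe: a goroutine scans
	// the raw reader line by line and writes only the kept lines to the pipe,
	// so the whole file never has to be buffered in memory.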
	if fs.config.IgnoreStartLines > 0 || fs.config.IgnoreEndLines > 0 {
		r, w := io.Pipe()
		go func() {
			defer func() {
				ctx.GetLogger().Debugf("Close pipe files %s", file)
				w.Close()
				reader.Close()
			}()
			scanner := bufio.NewScanner(reader)
			scanner.Split(bufio.ScanLines)
			ln := 0
			// A ring buffer that always holds the most recent IgnoreEndLines
			// lines; whatever remains in it at EOF is never written out.
			tempLines := make([]string, 0, fs.config.IgnoreEndLines)
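			// For example, with IgnoreStartLines = 1 and IgnoreEndLines = 2,
			// the lines A, B, C, D, E are handled as follows: A is skipped;
			// B and C fill the buffer; D flushes B and takes its slot; E
			// flushes C; D and E remain buffered at EOF and are dropped.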
			for scanner.Scan() {
				if ln >= fs.config.IgnoreStartLines {
					if fs.config.IgnoreEndLines > 0 { // the last n lines are held back in tempLines
						slot := (ln - fs.config.IgnoreStartLines) % fs.config.IgnoreEndLines
						if len(tempLines) <= slot { // first round: fill the buffer
							tempLines = append(tempLines, scanner.Text())
						} else {
							_, err := w.Write([]byte(tempLines[slot]))
							if err != nil {
								ctx.GetLogger().Error(err)
								break
							}
							_, err = w.Write([]byte{'\n'})
							if err != nil {
								ctx.GetLogger().Error(err)
								break
							}
							tempLines[slot] = scanner.Text()
						}
					} else {
						_, err = w.Write(scanner.Bytes())
						if err != nil {
							ctx.GetLogger().Error(err)
							break
						}
						_, err = w.Write([]byte{'\n'})
						if err != nil {
							ctx.GetLogger().Error(err)
							break
						}
					}
				}
				ln++
			}
			// Surface any scan error instead of silently truncating the input.
			if err := scanner.Err(); err != nil {
				ctx.GetLogger().Error(err)
			}
		}()
		return r, nil
	}
	return reader, nil
}