file_source.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. // Copyright 2021-2023 EMQ Technologies Co., Ltd.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package file
  15. import (
  16. "bufio"
  17. "encoding/csv"
  18. "encoding/json"
  19. "errors"
  20. "fmt"
  21. "github.com/lf-edge/ekuiper/internal/conf"
  22. "github.com/lf-edge/ekuiper/internal/xsql"
  23. "github.com/lf-edge/ekuiper/pkg/api"
  24. "github.com/lf-edge/ekuiper/pkg/cast"
  25. "io"
  26. "os"
  27. "path/filepath"
  28. "strconv"
  29. "strings"
  30. "time"
  31. )
// FileType is the format of the source file content.
type FileType string

const (
	// JSON_TYPE expects the file to be a single JSON array of objects.
	JSON_TYPE FileType = "json"
	// CSV_TYPE expects delimiter-separated records, optionally with a header.
	CSV_TYPE FileType = "csv"
	// LINES_TYPE decodes each line independently with the stream's decoder.
	LINES_TYPE FileType = "lines"
)

// fileTypes is the set of supported file types, used to validate the
// fileType property in Configure.
var fileTypes = map[FileType]struct{}{
	JSON_TYPE:  {},
	CSV_TYPE:   {},
	LINES_TYPE: {},
}
// FileSourceConfig holds the user-facing properties of the file source,
// populated from the stream properties map in Configure.
type FileSourceConfig struct {
	// FileType selects the decoder: json (default), csv or lines.
	FileType FileType `json:"fileType"`
	// Path is the directory to read from; relative paths are resolved via
	// conf.GetLoc in Configure.
	Path string `json:"path"`
	// Interval is the reload period in milliseconds; 0 or less loads once.
	Interval int `json:"interval"`
	// IsTable marks the source as backing a table; Load then emits a nil
	// tuple as an end-of-batch marker.
	IsTable bool `json:"isTable"`
	// SendInterval is a per-tuple pacing delay in milliseconds.
	SendInterval int `json:"sendInterval"`
	// ActionAfterRead: 0 = keep the file, 1 = delete it, 2 = move it to MoveTo.
	ActionAfterRead int `json:"actionAfterRead"`
	// MoveTo is the target directory for ActionAfterRead == 2.
	MoveTo string `json:"moveTo"`
	// HasHeader makes the first CSV record the column names.
	HasHeader bool `json:"hasHeader"`
	// Columns supplies CSV column names when there is no header.
	Columns []string `json:"columns"`
	// IgnoreStartLines skips this many lines at the start of each file.
	IgnoreStartLines int `json:"ignoreStartLines"`
	// IgnoreEndLines drops this many lines at the end of each file.
	IgnoreEndLines int `json:"ignoreEndLines"`
	// Delimiter is the CSV field separator; only its first byte is used,
	// defaulting to ",".
	Delimiter string `json:"delimiter"`
}
// FileSource The BATCH to load data from file at once
type FileSource struct {
	// file is the absolute path being monitored (a file or a directory).
	file string
	// isDir is true when file points at a directory; Load then scans all
	// regular files inside it.
	isDir bool
	// config is the validated configuration set by Configure.
	config *FileSourceConfig
}
  63. func (fs *FileSource) Close(ctx api.StreamContext) error {
  64. ctx.GetLogger().Infof("Close file source")
  65. // do nothing
  66. return nil
  67. }
  68. func (fs *FileSource) Configure(fileName string, props map[string]interface{}) error {
  69. cfg := &FileSourceConfig{
  70. FileType: JSON_TYPE,
  71. }
  72. err := cast.MapToStruct(props, cfg)
  73. if err != nil {
  74. return fmt.Errorf("read properties %v fail with error: %v", props, err)
  75. }
  76. if cfg.FileType == "" {
  77. return errors.New("missing or invalid property fileType, must be 'json'")
  78. }
  79. if _, ok := fileTypes[cfg.FileType]; !ok {
  80. return fmt.Errorf("invalid property fileType: %s", cfg.FileType)
  81. }
  82. if cfg.Path == "" {
  83. return errors.New("missing property Path")
  84. }
  85. if !filepath.IsAbs(cfg.Path) {
  86. cfg.Path, err = conf.GetLoc(cfg.Path)
  87. if err != nil {
  88. return fmt.Errorf("invalid path %s", cfg.Path)
  89. }
  90. }
  91. if fileName != "/$$TEST_CONNECTION$$" {
  92. fs.file = filepath.Join(cfg.Path, fileName)
  93. fi, err := os.Stat(fs.file)
  94. if err != nil {
  95. if os.IsNotExist(err) {
  96. return fmt.Errorf("file %s not exist", fs.file)
  97. }
  98. }
  99. if fi.IsDir() {
  100. fs.isDir = true
  101. }
  102. }
  103. if cfg.IgnoreStartLines < 0 {
  104. cfg.IgnoreStartLines = 0
  105. }
  106. if cfg.IgnoreEndLines < 0 {
  107. cfg.IgnoreEndLines = 0
  108. }
  109. if cfg.ActionAfterRead < 0 || cfg.ActionAfterRead > 2 {
  110. return fmt.Errorf("invalid actionAfterRead: %d", cfg.ActionAfterRead)
  111. }
  112. if cfg.ActionAfterRead == 2 {
  113. if cfg.MoveTo == "" {
  114. return fmt.Errorf("missing moveTo when actionAfterRead is 2")
  115. } else {
  116. if !filepath.IsAbs(cfg.MoveTo) {
  117. cfg.MoveTo, err = conf.GetLoc(cfg.MoveTo)
  118. if err != nil {
  119. return fmt.Errorf("invalid moveTo %s: %v", cfg.MoveTo, err)
  120. }
  121. }
  122. fileInfo, err := os.Stat(cfg.MoveTo)
  123. if err != nil {
  124. err := os.MkdirAll(cfg.MoveTo, os.ModePerm)
  125. if err != nil {
  126. return fmt.Errorf("fail to create dir for moveTo %s: %v", cfg.MoveTo, err)
  127. }
  128. } else if !fileInfo.IsDir() {
  129. return fmt.Errorf("moveTo %s is not a directory", cfg.MoveTo)
  130. }
  131. }
  132. }
  133. if cfg.Delimiter == "" {
  134. cfg.Delimiter = ","
  135. }
  136. fs.config = cfg
  137. return nil
  138. }
  139. func (fs *FileSource) Open(ctx api.StreamContext, consumer chan<- api.SourceTuple, errCh chan<- error) {
  140. err := fs.Load(ctx, consumer)
  141. if err != nil {
  142. select {
  143. case consumer <- &xsql.ErrorSourceTuple{Error: err}:
  144. ctx.GetLogger().Errorf("find error when loading file %s with err %v", fs.file, err)
  145. case <-ctx.Done():
  146. return
  147. }
  148. }
  149. if fs.config.Interval > 0 {
  150. ticker := time.NewTicker(time.Millisecond * time.Duration(fs.config.Interval))
  151. logger := ctx.GetLogger()
  152. defer ticker.Stop()
  153. for {
  154. select {
  155. case <-ticker.C:
  156. logger.Debugf("Load file source again at %v", conf.GetNowInMilli())
  157. err := fs.Load(ctx, consumer)
  158. if err != nil {
  159. errCh <- err
  160. return
  161. }
  162. case <-ctx.Done():
  163. return
  164. }
  165. }
  166. }
  167. }
  168. func (fs *FileSource) Load(ctx api.StreamContext, consumer chan<- api.SourceTuple) error {
  169. if fs.isDir {
  170. ctx.GetLogger().Debugf("Monitor dir %s", fs.file)
  171. entries, err := os.ReadDir(fs.file)
  172. if err != nil {
  173. return err
  174. }
  175. for _, entry := range entries {
  176. if entry.IsDir() {
  177. continue
  178. }
  179. file := filepath.Join(fs.file, entry.Name())
  180. err := fs.parseFile(ctx, file, consumer)
  181. if err != nil {
  182. ctx.GetLogger().Errorf("parse file %s fail with error: %v", file, err)
  183. continue
  184. }
  185. }
  186. } else {
  187. err := fs.parseFile(ctx, fs.file, consumer)
  188. if err != nil {
  189. return err
  190. }
  191. }
  192. // Send EOF if retain size not set if used in table
  193. if fs.config.IsTable {
  194. select {
  195. case consumer <- api.NewDefaultSourceTuple(nil, nil):
  196. // do nothing
  197. case <-ctx.Done():
  198. return nil
  199. }
  200. }
  201. ctx.GetLogger().Debug("All tuples sent")
  202. return nil
  203. }
  204. func (fs *FileSource) parseFile(ctx api.StreamContext, file string, consumer chan<- api.SourceTuple) (result error) {
  205. r, err := fs.prepareFile(ctx, file)
  206. if err != nil {
  207. ctx.GetLogger().Debugf("prepare file %s error: %v", file, err)
  208. return err
  209. }
  210. meta := map[string]interface{}{
  211. "file": file,
  212. }
  213. defer func() {
  214. ctx.GetLogger().Debugf("Finish loading from file %s", file)
  215. if closer, ok := r.(io.Closer); ok {
  216. ctx.GetLogger().Debugf("Close reader")
  217. closer.Close()
  218. }
  219. if result == nil {
  220. switch fs.config.ActionAfterRead {
  221. case 1:
  222. if err := os.Remove(file); err != nil {
  223. result = err
  224. }
  225. ctx.GetLogger().Debugf("Remove file %s", file)
  226. case 2:
  227. targetFile := filepath.Join(fs.config.MoveTo, filepath.Base(file))
  228. if err := os.Rename(file, targetFile); err != nil {
  229. result = err
  230. }
  231. ctx.GetLogger().Debugf("Move file %s to %s", file, targetFile)
  232. }
  233. }
  234. }()
  235. if err := fs.publish(ctx, r, consumer, meta); err != nil {
  236. return err
  237. }
  238. return nil
  239. }
  240. func (fs *FileSource) publish(ctx api.StreamContext, file io.Reader, consumer chan<- api.SourceTuple, meta map[string]interface{}) error {
  241. ctx.GetLogger().Debug("Start to load")
  242. switch fs.config.FileType {
  243. case JSON_TYPE:
  244. r := json.NewDecoder(file)
  245. resultMap := make([]map[string]interface{}, 0)
  246. err := r.Decode(&resultMap)
  247. if err != nil {
  248. return fmt.Errorf("loaded %s, check error %s", fs.file, err)
  249. }
  250. ctx.GetLogger().Debug("Sending tuples")
  251. for _, m := range resultMap {
  252. select {
  253. case consumer <- api.NewDefaultSourceTuple(m, meta):
  254. case <-ctx.Done():
  255. return nil
  256. }
  257. if fs.config.SendInterval > 0 {
  258. time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
  259. }
  260. }
  261. return nil
  262. case CSV_TYPE:
  263. r := csv.NewReader(file)
  264. r.Comma = rune(fs.config.Delimiter[0])
  265. r.TrimLeadingSpace = true
  266. r.FieldsPerRecord = -1
  267. cols := fs.config.Columns
  268. if fs.config.HasHeader {
  269. var err error
  270. ctx.GetLogger().Debug("Has header")
  271. cols, err = r.Read()
  272. if err == io.EOF {
  273. break
  274. }
  275. if err != nil {
  276. ctx.GetLogger().Warnf("Read file %s encounter error: %v", fs.file, err)
  277. return err
  278. }
  279. ctx.GetLogger().Debugf("Got header %v", cols)
  280. }
  281. for {
  282. record, err := r.Read()
  283. if err == io.EOF {
  284. break
  285. }
  286. if err != nil {
  287. ctx.GetLogger().Warnf("Read file %s encounter error: %v", fs.file, err)
  288. continue
  289. }
  290. ctx.GetLogger().Debugf("Read" + strings.Join(record, ","))
  291. var m map[string]interface{}
  292. if cols == nil {
  293. m = make(map[string]interface{}, len(record))
  294. for i, v := range record {
  295. m["cols"+strconv.Itoa(i)] = v
  296. }
  297. } else {
  298. m = make(map[string]interface{}, len(cols))
  299. for i, v := range cols {
  300. m[v] = record[i]
  301. }
  302. }
  303. select {
  304. case consumer <- api.NewDefaultSourceTuple(m, meta):
  305. case <-ctx.Done():
  306. return nil
  307. }
  308. if fs.config.SendInterval > 0 {
  309. time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
  310. }
  311. }
  312. case LINES_TYPE:
  313. scanner := bufio.NewScanner(file)
  314. scanner.Split(bufio.ScanLines)
  315. for scanner.Scan() {
  316. var tuple api.SourceTuple
  317. m, err := ctx.Decode(scanner.Bytes())
  318. if err != nil {
  319. tuple = &xsql.ErrorSourceTuple{
  320. Error: fmt.Errorf("Invalid data format, cannot decode %s with error %s", scanner.Text(), err),
  321. }
  322. } else {
  323. tuple = api.NewDefaultSourceTuple(m, meta)
  324. }
  325. select {
  326. case consumer <- tuple:
  327. case <-ctx.Done():
  328. return nil
  329. }
  330. if fs.config.SendInterval > 0 {
  331. time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
  332. }
  333. }
  334. default:
  335. return fmt.Errorf("invalid file type %s", fs.config.FileType)
  336. }
  337. return nil
  338. }
// prepareFile prepare file by deleting ignore lines
//
// It opens the file and, when ignoreStartLines/ignoreEndLines are set,
// returns the read end of a pipe whose writer goroutine streams only the
// kept lines (newline-terminated). Otherwise the *os.File itself is
// returned. Callers that receive the plain file are responsible for closing
// it; the pipe variant closes both pipe and file when its goroutine ends.
func (fs *FileSource) prepareFile(ctx api.StreamContext, file string) (io.Reader, error) {
	f, err := os.Open(file)
	if err != nil {
		ctx.GetLogger().Error(err)
		return nil, err
	}
	if fs.config.IgnoreStartLines > 0 || fs.config.IgnoreEndLines > 0 {
		// Stream through a pipe so trimming works without loading the whole
		// file into memory; the goroutine below is the sole writer.
		r, w := io.Pipe()
		go func() {
			defer func() {
				ctx.GetLogger().Debugf("Close pipe files %s", file)
				w.Close()
				f.Close()
			}()
			scanner := bufio.NewScanner(f)
			scanner.Split(bufio.ScanLines)
			// ln counts lines from 0; the first IgnoreStartLines are skipped.
			ln := 0
			// This is a queue to store the lines that should be ignored
			// (a fixed-size ring of the most recent IgnoreEndLines lines:
			// whatever is still in it at EOF is the trailing block to drop).
			tempLines := make([]string, 0, fs.config.IgnoreEndLines)
			for scanner.Scan() {
				if ln >= fs.config.IgnoreStartLines {
					if fs.config.IgnoreEndLines > 0 { // the last n line are left in the tempLines
						// Ring slot for this line; lines are delayed by
						// IgnoreEndLines before being written out.
						slot := (ln - fs.config.IgnoreStartLines) % fs.config.IgnoreEndLines
						if len(tempLines) <= slot { // first round
							tempLines = append(tempLines, scanner.Text())
						} else {
							// Flush the line evicted from this slot, then
							// store the current one in its place.
							_, err := w.Write([]byte(tempLines[slot]))
							if err != nil {
								ctx.GetLogger().Error(err)
								break
							}
							_, err = w.Write([]byte{'\n'})
							if err != nil {
								ctx.GetLogger().Error(err)
								break
							}
							tempLines[slot] = scanner.Text()
						}
					} else {
						// Only leading lines are trimmed: pass through with a
						// restored newline (the scanner strips it).
						_, err = w.Write(scanner.Bytes())
						if err != nil {
							ctx.GetLogger().Error(err)
							break
						}
						_, err = w.Write([]byte{'\n'})
						if err != nil {
							ctx.GetLogger().Error(err)
							break
						}
					}
				}
				ln++
			}
		}()
		return r, nil
	}
	return f, nil
}
  397. }