file_source.go

// Copyright 2021-2023 EMQ Technologies Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file

import (
	"bufio"
	"encoding/csv"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/lf-edge/ekuiper/internal/conf"
	"github.com/lf-edge/ekuiper/internal/xsql"
	"github.com/lf-edge/ekuiper/pkg/api"
	"github.com/lf-edge/ekuiper/pkg/cast"
)
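
// FileSourceConfig holds the source properties. Interval and SendInterval are
// in milliseconds. ActionAfterRead controls post-processing: 0 keeps the file,
// 1 deletes it, and 2 moves it to MoveTo. IgnoreStartLines and IgnoreEndLines
// trim lines from the head and tail of each file before decoding.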
type FileSourceConfig struct {
	FileType         FileType `json:"fileType"`
	Path             string   `json:"path"`
	Interval         int      `json:"interval"`
	IsTable          bool     `json:"isTable"`
	SendInterval     int      `json:"sendInterval"`
	ActionAfterRead  int      `json:"actionAfterRead"`
	MoveTo           string   `json:"moveTo"`
	HasHeader        bool     `json:"hasHeader"`
	Columns          []string `json:"columns"`
	IgnoreStartLines int      `json:"ignoreStartLines"`
	IgnoreEndLines   int      `json:"ignoreEndLines"`
	Delimiter        string   `json:"delimiter"`
}

// FileSource is a batch source that loads all data from a file at once.
type FileSource struct {
	file   string
	isDir  bool
	config *FileSourceConfig
}

func (fs *FileSource) Close(ctx api.StreamContext) error {
	ctx.GetLogger().Infof("Close file source")
	// do nothing
	return nil
}
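
// Configure reads the source properties into a FileSourceConfig and validates
// them. fileName comes from the stream's DATASOURCE and is joined with path;
// if the result is a directory, every file in it is read. A typical properties
// map might look like this (illustrative values, not taken from this file):
//
//	props := map[string]interface{}{
//		"fileType":        "csv",
//		"path":            "data", // relative paths are resolved via conf.GetLoc
//		"hasHeader":       true,
//		"actionAfterRead": 2, // move each file after reading
//		"moveTo":          "data/done",
//	}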
func (fs *FileSource) Configure(fileName string, props map[string]interface{}) error {
	cfg := &FileSourceConfig{
		FileType: JSON_TYPE,
	}
	err := cast.MapToStruct(props, cfg)
	if err != nil {
		return fmt.Errorf("read properties %v fail with error: %v", props, err)
	}
	if cfg.FileType == "" {
		return errors.New("missing or invalid property fileType")
	}
	if _, ok := fileTypes[cfg.FileType]; !ok {
		return fmt.Errorf("invalid property fileType: %s", cfg.FileType)
	}
	if cfg.Path == "" {
		return errors.New("missing property Path")
	}
	if !filepath.IsAbs(cfg.Path) {
		cfg.Path, err = conf.GetLoc(cfg.Path)
		if err != nil {
			return fmt.Errorf("invalid path %s", cfg.Path)
		}
	}
	if fileName != "/$$TEST_CONNECTION$$" {
		fs.file = filepath.Join(cfg.Path, fileName)
		fi, err := os.Stat(fs.file)
		if err != nil {
			if os.IsNotExist(err) {
				return fmt.Errorf("file %s does not exist", fs.file)
			}
			// Return other stat errors too; fi is nil here and must not be used.
			return fmt.Errorf("stat file %s error: %v", fs.file, err)
		}
		if fi.IsDir() {
			fs.isDir = true
		}
	}
	if cfg.IgnoreStartLines < 0 {
		cfg.IgnoreStartLines = 0
	}
	if cfg.IgnoreEndLines < 0 {
		cfg.IgnoreEndLines = 0
	}
	if cfg.ActionAfterRead < 0 || cfg.ActionAfterRead > 2 {
		return fmt.Errorf("invalid actionAfterRead: %d", cfg.ActionAfterRead)
	}
	if cfg.ActionAfterRead == 2 {
		if cfg.MoveTo == "" {
			return fmt.Errorf("missing moveTo when actionAfterRead is 2")
		}
		if !filepath.IsAbs(cfg.MoveTo) {
			cfg.MoveTo, err = conf.GetLoc(cfg.MoveTo)
			if err != nil {
				return fmt.Errorf("invalid moveTo %s: %v", cfg.MoveTo, err)
			}
		}
		fileInfo, err := os.Stat(cfg.MoveTo)
		if err != nil {
			err := os.MkdirAll(cfg.MoveTo, os.ModePerm)
			if err != nil {
				return fmt.Errorf("fail to create dir for moveTo %s: %v", cfg.MoveTo, err)
			}
		} else if !fileInfo.IsDir() {
			return fmt.Errorf("moveTo %s is not a directory", cfg.MoveTo)
		}
	}
	if cfg.Delimiter == "" {
		cfg.Delimiter = ","
	}
	fs.config = cfg
	return nil
}
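
// Open loads the file once immediately. If interval is positive, it then
// reloads the file on every tick until the rule's context is cancelled.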
func (fs *FileSource) Open(ctx api.StreamContext, consumer chan<- api.SourceTuple, errCh chan<- error) {
	err := fs.Load(ctx, consumer)
	if err != nil {
		select {
		case consumer <- &xsql.ErrorSourceTuple{Error: err}:
			ctx.GetLogger().Errorf("loading file %s failed: %v", fs.file, err)
		case <-ctx.Done():
			return
		}
	}
	if fs.config.Interval > 0 {
		ticker := time.NewTicker(time.Millisecond * time.Duration(fs.config.Interval))
		logger := ctx.GetLogger()
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				logger.Debugf("Load file source again at %v", conf.GetNowInMilli())
				err := fs.Load(ctx, consumer)
				if err != nil {
					errCh <- err
					return
				}
			case <-ctx.Done():
				return
			}
		}
	}
}
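
// Load parses the configured file, or every regular file in the directory when
// the path points to one. Per-file parse errors in a directory are logged and
// skipped rather than aborting the whole batch.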
func (fs *FileSource) Load(ctx api.StreamContext, consumer chan<- api.SourceTuple) error {
	if fs.isDir {
		ctx.GetLogger().Debugf("Monitor dir %s", fs.file)
		entries, err := os.ReadDir(fs.file)
		if err != nil {
			return err
		}
		for _, entry := range entries {
			if entry.IsDir() {
				continue
			}
			file := filepath.Join(fs.file, entry.Name())
			err := fs.parseFile(ctx, file, consumer)
			if err != nil {
				ctx.GetLogger().Errorf("parse file %s fail with error: %v", file, err)
				continue
			}
		}
	} else {
		err := fs.parseFile(ctx, fs.file, consumer)
		if err != nil {
			return err
		}
	}
	// When used as a table, send an EOF (nil) tuple so the table knows the batch load is complete.
	if fs.config.IsTable {
		select {
		case consumer <- api.NewDefaultSourceTuple(nil, nil):
			// do nothing
		case <-ctx.Done():
			return nil
		}
	}
	ctx.GetLogger().Debug("All tuples sent")
	return nil
}
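
// parseFile streams one file's tuples to the consumer, then applies
// actionAfterRead: on success, 1 removes the file and 2 moves it to moveTo.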
func (fs *FileSource) parseFile(ctx api.StreamContext, file string, consumer chan<- api.SourceTuple) (result error) {
	r, err := fs.prepareFile(ctx, file)
	if err != nil {
		ctx.GetLogger().Debugf("prepare file %s error: %v", file, err)
		return err
	}
	meta := map[string]interface{}{
		"file": file,
	}
	defer func() {
		ctx.GetLogger().Debugf("Finish loading from file %s", file)
		if closer, ok := r.(io.Closer); ok {
			ctx.GetLogger().Debugf("Close reader")
			closer.Close()
		}
		if result == nil {
			switch fs.config.ActionAfterRead {
			case 1:
				if err := os.Remove(file); err != nil {
					result = err
				}
				ctx.GetLogger().Debugf("Remove file %s", file)
			case 2:
				targetFile := filepath.Join(fs.config.MoveTo, filepath.Base(file))
				if err := os.Rename(file, targetFile); err != nil {
					result = err
				}
				ctx.GetLogger().Debugf("Move file %s to %s", file, targetFile)
			}
		}
	}()
	return fs.publish(ctx, r, consumer, meta)
}
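
// publish decodes the reader according to fileType (json: one array of
// objects; csv: one tuple per record; lines: one decoded tuple per line) and
// emits the tuples, sleeping sendInterval milliseconds between sends when
// configured.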
func (fs *FileSource) publish(ctx api.StreamContext, file io.Reader, consumer chan<- api.SourceTuple, meta map[string]interface{}) error {
	ctx.GetLogger().Debug("Start to load")
	switch fs.config.FileType {
	case JSON_TYPE:
		r := json.NewDecoder(file)
		resultMap := make([]map[string]interface{}, 0)
		err := r.Decode(&resultMap)
		if err != nil {
			return fmt.Errorf("decode file %s error: %v", fs.file, err)
		}
		ctx.GetLogger().Debug("Sending tuples")
		for _, m := range resultMap {
			select {
			case consumer <- api.NewDefaultSourceTuple(m, meta):
			case <-ctx.Done():
				return nil
			}
			if fs.config.SendInterval > 0 {
				time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
			}
		}
		return nil
	case CSV_TYPE:
		r := csv.NewReader(file)
		r.Comma = rune(fs.config.Delimiter[0])
		r.TrimLeadingSpace = true
		r.FieldsPerRecord = -1
		cols := fs.config.Columns
		if fs.config.HasHeader {
			var err error
			ctx.GetLogger().Debug("Has header")
			cols, err = r.Read()
			if err == io.EOF {
				// Empty file: break out of the switch, there is nothing to send.
				break
			}
			if err != nil {
				ctx.GetLogger().Warnf("Read file %s encounter error: %v", fs.file, err)
				return err
			}
			ctx.GetLogger().Debugf("Got header %v", cols)
		}
		for {
			record, err := r.Read()
			if err == io.EOF {
				break
			}
			if err != nil {
				ctx.GetLogger().Warnf("Read file %s encounter error: %v", fs.file, err)
				continue
			}
			ctx.GetLogger().Debugf("Read %s", strings.Join(record, ","))
			var m map[string]interface{}
			if cols == nil {
				m = make(map[string]interface{}, len(record))
				for i, v := range record {
					m["cols"+strconv.Itoa(i)] = v
				}
			} else {
				m = make(map[string]interface{}, len(cols))
				for i, v := range cols {
					// FieldsPerRecord is -1, so records may be shorter than the
					// header; guard the index to avoid a panic on short records.
					if i < len(record) {
						m[v] = record[i]
					}
				}
			}
			select {
			case consumer <- api.NewDefaultSourceTuple(m, meta):
			case <-ctx.Done():
				return nil
			}
			if fs.config.SendInterval > 0 {
				time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
			}
		}
	case LINES_TYPE:
		scanner := bufio.NewScanner(file)
		scanner.Split(bufio.ScanLines)
		for scanner.Scan() {
			var tuple api.SourceTuple
			m, err := ctx.Decode(scanner.Bytes())
			if err != nil {
				tuple = &xsql.ErrorSourceTuple{
					Error: fmt.Errorf("invalid data format, cannot decode %s with error %s", scanner.Text(), err),
				}
			} else {
				tuple = api.NewDefaultSourceTuple(m, meta)
			}
			select {
			case consumer <- tuple:
			case <-ctx.Done():
				return nil
			}
			if fs.config.SendInterval > 0 {
				time.Sleep(time.Millisecond * time.Duration(fs.config.SendInterval))
			}
		}
	default:
		return fmt.Errorf("invalid file type %s", fs.config.FileType)
	}
	return nil
}

// prepareFile opens the file and returns a reader that skips the configured
// ignoreStartLines head lines and ignoreEndLines tail lines.
func (fs *FileSource) prepareFile(ctx api.StreamContext, file string) (io.Reader, error) {
	f, err := os.Open(file)
	if err != nil {
		ctx.GetLogger().Error(err)
		return nil, err
	}
	if fs.config.IgnoreStartLines > 0 || fs.config.IgnoreEndLines > 0 {
		r, w := io.Pipe()
		go func() {
			defer func() {
				ctx.GetLogger().Debugf("Close pipe files %s", file)
				w.Close()
				f.Close()
			}()
			scanner := bufio.NewScanner(f)
			scanner.Split(bufio.ScanLines)
			ln := 0
			// A ring buffer holding the most recent ignoreEndLines lines; output
			// is delayed by its length, so whatever remains when the file ends
			// (i.e. the last ignoreEndLines lines) is never written to the pipe.
			tempLines := make([]string, 0, fs.config.IgnoreEndLines)
			for scanner.Scan() {
				if ln >= fs.config.IgnoreStartLines {
					if fs.config.IgnoreEndLines > 0 {
						slot := (ln - fs.config.IgnoreStartLines) % fs.config.IgnoreEndLines
						if len(tempLines) <= slot { // first round: just fill the buffer
							tempLines = append(tempLines, scanner.Text())
						} else { // flush the delayed line, then overwrite its slot
							_, err := w.Write([]byte(tempLines[slot]))
							if err != nil {
								ctx.GetLogger().Error(err)
								break
							}
							_, err = w.Write([]byte{'\n'})
							if err != nil {
								ctx.GetLogger().Error(err)
								break
							}
							tempLines[slot] = scanner.Text()
						}
					} else {
						_, err = w.Write(scanner.Bytes())
						if err != nil {
							ctx.GetLogger().Error(err)
							break
						}
						_, err = w.Write([]byte{'\n'})
						if err != nil {
							ctx.GetLogger().Error(err)
							break
						}
					}
				}
				ln++
			}
		}()
		return r, nil
	}
	return f, nil
}
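
// A stream over this source might be declared like this (illustrative example,
// not part of this file; see the eKuiper documentation for the exact syntax):
//
//	CREATE STREAM fileDemo () WITH (DATASOURCE="test.json", FORMAT="JSON", TYPE="file");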