ruleState.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. // Copyright 2022-2023 EMQ Technologies Co., Ltd.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package rule
  15. import (
  16. "context"
  17. "fmt"
  18. "math"
  19. "math/rand"
  20. "strings"
  21. "sync"
  22. "time"
  23. "github.com/robfig/cron/v3"
  24. "github.com/lf-edge/ekuiper/internal/conf"
  25. "github.com/lf-edge/ekuiper/internal/topo"
  26. "github.com/lf-edge/ekuiper/internal/topo/planner"
  27. "github.com/lf-edge/ekuiper/pkg/api"
  28. "github.com/lf-edge/ekuiper/pkg/infra"
  29. "github.com/lf-edge/ekuiper/pkg/schedule"
  30. )
// ActionSignal is a command sent over RuleState.ActionCh to the rule's action loop.
type ActionSignal int

const (
	// ActionSignalStart asks the action loop to launch the topo running loop.
	ActionSignalStart ActionSignal = iota
	// ActionSignalStop asks the action loop to cancel the topo running loop.
	ActionSignalStop
)
// cronInterface is the subset of scheduler operations used by this file.
// It exists so that init can install either a real cron scheduler or a
// MockCron when running under test.
type cronInterface interface {
	// Start is invoked once from init right after the scheduler is created.
	Start()
	// AddFunc registers cmd under the cron expression spec and returns the
	// entry ID that can later be handed to Remove.
	AddFunc(spec string, cmd func()) (cron.EntryID, error)
	// Remove unregisters the entry previously returned by AddFunc.
	Remove(id cron.EntryID)
}
  41. var backgroundCron cronInterface
  42. func init() {
  43. if !conf.IsTesting {
  44. backgroundCron = cron.New()
  45. } else {
  46. backgroundCron = &MockCron{}
  47. }
  48. backgroundCron.Start()
  49. }
// cronStateCtx holds the scheduling bookkeeping of a scheduled rule.
type cronStateCtx struct {
	// cancel cancels the context handed to the stop-after-duration goroutines;
	// it is set by startScheduleRule and invoked by stopScheduleRule.
	cancel context.CancelFunc
	// entryID is the ID returned by backgroundCron.AddFunc, used to remove the job.
	entryID cron.EntryID
	// isInSchedule indicates whether the rule is currently registered in backgroundCron.
	isInSchedule bool
	// startFailedCnt counts cron-triggered starts that returned an error;
	// it is reset when the rule is removed from the schedule.
	startFailedCnt int
	// only used for test
	cron     string
	duration string
}
// RuleState is created for each rule. Each RuleState runs two loops:
//  1. the action event loop, which accepts commands (ActionSignalStart,
//     ActionSignalStop) through ActionCh and ends when ActionCh is closed;
//  2. the topo running loop, which opens the topology and handles restarts.
//
// Both loops need to access the status, so the embedded lock must be held
// when reading or writing the changeable fields.
type RuleState struct {
	// Constant, never changes.
	RuleId string
	// Constant, never changes. Channel used to send start/stop signals to the
	// action loop. It is closed when the rule is deleted (see Close).
	ActionCh chan ActionSignal
	// Nearly constant, only changes when the rule is updated
	Rule *api.Rule
	// States, created from the rule on each rule start
	Topology *topo.Topo
	// 0 stop, 1 start, -1 delete, changed in actions
	triggered int
	// temporary storage for the topo graph to make sure the graph is still available even after the rule is closed
	topoGraph *api.PrintableTopo
	sync.RWMutex
	// cronState tracks the scheduling state of a scheduled rule
	cronState cronStateCtx
}
  81. // NewRuleState Create and initialize a rule state.
  82. // Errors are possible during plan the topo.
  83. // If error happens return immediately without add it to the registry
  84. func NewRuleState(rule *api.Rule) (*RuleState, error) {
  85. rs := &RuleState{
  86. RuleId: rule.Id,
  87. Rule: rule,
  88. ActionCh: make(chan ActionSignal),
  89. }
  90. rs.run()
  91. if tp, err := planner.Plan(rule); err != nil {
  92. return rs, err
  93. } else {
  94. rs.Topology = tp
  95. return rs, nil
  96. }
  97. }
  98. // UpdateTopo update the rule and the topology AND restart the topology
  99. // Do not need to call restart after update
  100. func (rs *RuleState) UpdateTopo(rule *api.Rule) error {
  101. if _, err := planner.Plan(rule); err != nil {
  102. return err
  103. }
  104. if err := rs.Stop(); err != nil {
  105. return err
  106. }
  107. time.Sleep(1 * time.Millisecond)
  108. rs.Rule = rule
  109. return rs.Start()
  110. }
  111. // Run start to run the two loops, do not access any changeable states
  112. func (rs *RuleState) run() {
  113. var (
  114. ctx context.Context
  115. cancel context.CancelFunc
  116. )
  117. // action loop, once start never end until the rule is deleted
  118. go func() {
  119. conf.Log.Infof("Start rulestate %s", rs.RuleId)
  120. for {
  121. s, opened := <-rs.ActionCh
  122. if !opened {
  123. conf.Log.Infof("Stop rulestate %s", rs.RuleId)
  124. if cancel != nil {
  125. cancel()
  126. }
  127. return
  128. }
  129. switch s {
  130. case ActionSignalStart:
  131. if ctx != nil {
  132. conf.Log.Warnf("rule %s is already started", rs.RuleId)
  133. } else {
  134. ctx, cancel = context.WithCancel(context.Background())
  135. go rs.runTopo(ctx)
  136. }
  137. case ActionSignalStop:
  138. // Stop the running loop
  139. if cancel != nil {
  140. cancel()
  141. ctx = nil
  142. cancel = nil
  143. } else {
  144. conf.Log.Warnf("rule %s is already stopped", rs.RuleId)
  145. }
  146. }
  147. }
  148. }()
  149. }
  150. func (rs *RuleState) runTopo(ctx context.Context) {
  151. // Load the changeable states once
  152. rs.Lock()
  153. tp := rs.Topology
  154. option := rs.Rule.Options.Restart
  155. rs.Unlock()
  156. if tp == nil {
  157. conf.Log.Warnf("rule %s is not initialized or just stopped", rs.RuleId)
  158. return
  159. }
  160. err := infra.SafeRun(func() error {
  161. count := 0
  162. d := option.Delay
  163. var er error
  164. ticker := time.NewTicker(time.Duration(d) * time.Millisecond)
  165. defer ticker.Stop()
  166. for {
  167. select {
  168. case e := <-tp.Open():
  169. er = e
  170. if er != nil { // Only restart rule for errors
  171. tp.GetContext().SetError(er)
  172. conf.Log.Errorf("closing rule %s for error: %v", rs.RuleId, er)
  173. tp.Cancel()
  174. } else { // exit normally
  175. return nil
  176. }
  177. }
  178. if count < option.Attempts {
  179. if d > option.MaxDelay {
  180. d = option.MaxDelay
  181. }
  182. if option.JitterFactor > 0 {
  183. d = int(math.Round(float64(d) * ((rand.Float64()*2-1)*0.1 + 1)))
  184. conf.Log.Infof("Rule %s will restart with jitterred delay %d", rs.RuleId, d)
  185. } else {
  186. conf.Log.Infof("Rule %s will restart with delay %d", rs.RuleId, d)
  187. }
  188. // retry after delay
  189. select {
  190. case <-ticker.C:
  191. break
  192. case <-ctx.Done():
  193. conf.Log.Errorf("stop rule %s retry as cancelled", rs.RuleId)
  194. return nil
  195. }
  196. count++
  197. if option.Multiplier > 0 {
  198. d = option.Delay * int(math.Pow(option.Multiplier, float64(count)))
  199. }
  200. } else {
  201. return er
  202. }
  203. }
  204. })
  205. if err != nil { // Exit after retries
  206. rs.Lock()
  207. // The only change the state by error
  208. if rs.triggered != -1 {
  209. rs.triggered = 0
  210. if rs.Topology != nil {
  211. rs.topoGraph = rs.Topology.GetTopo()
  212. }
  213. rs.ActionCh <- ActionSignalStop
  214. }
  215. rs.Unlock()
  216. }
  217. }
  218. // The action functions are state machine.
  219. func (rs *RuleState) Start() error {
  220. rs.Lock()
  221. defer rs.Unlock()
  222. if rs.triggered == -1 {
  223. return fmt.Errorf("rule %s is already deleted", rs.RuleId)
  224. }
  225. if rs.Rule.IsLongRunningScheduleRule() {
  226. isIn, err := schedule.IsInScheduleRanges(conf.GetNow(), rs.Rule.Options.CronDatetimeRange)
  227. if err != nil {
  228. return err
  229. }
  230. // When rule is created, we need to check its schedule range before start it.
  231. if !isIn {
  232. return nil
  233. }
  234. }
  235. if rs.Rule.IsScheduleRule() {
  236. return rs.startScheduleRule()
  237. }
  238. return rs.start()
  239. }
  240. // startScheduleRule will register the job in the backgroundCron to run.
  241. // Job will do following 2 things:
  242. // 1. start the rule in cron if else the job is already stopped
  243. // 2. after the rule started, start an extract goroutine to stop the rule after specific duration
  244. func (rs *RuleState) startScheduleRule() error {
  245. if rs.cronState.isInSchedule {
  246. return fmt.Errorf("rule %s is already in schedule", rs.RuleId)
  247. }
  248. d, err := time.ParseDuration(rs.Rule.Options.Duration)
  249. if err != nil {
  250. return err
  251. }
  252. var cronCtx context.Context
  253. cronCtx, rs.cronState.cancel = context.WithCancel(context.Background())
  254. now := conf.GetNow()
  255. isInRunningSchedule, remainedDuration, err := rs.isInRunningSchedule(now, d)
  256. if err != nil {
  257. return err
  258. }
  259. if isInRunningSchedule {
  260. if err := rs.runScheduleRule(); err != nil {
  261. return err
  262. }
  263. rs.stopAfterDuration(remainedDuration, cronCtx)
  264. }
  265. entryID, err := backgroundCron.AddFunc(rs.Rule.Options.Cron, func() {
  266. var started bool
  267. var err error
  268. if started, err = func() (bool, error) {
  269. switch backgroundCron.(type) {
  270. case *MockCron:
  271. // skip mutex if this is a unit test
  272. default:
  273. rs.Lock()
  274. defer rs.Unlock()
  275. }
  276. now := conf.GetNow()
  277. allowed, err := rs.isInAllowedTimeRange(now)
  278. if err != nil {
  279. return false, err
  280. }
  281. if !allowed {
  282. return false, nil
  283. }
  284. rs.cronState.cron = rs.Rule.Options.Cron
  285. rs.cronState.duration = rs.Rule.Options.Duration
  286. return true, rs.start()
  287. }(); err != nil {
  288. rs.Lock()
  289. rs.cronState.startFailedCnt++
  290. rs.Unlock()
  291. conf.Log.Errorf(err.Error())
  292. return
  293. }
  294. if started {
  295. rs.stopAfterDuration(d, cronCtx)
  296. }
  297. })
  298. if err != nil {
  299. return err
  300. }
  301. rs.cronState.isInSchedule = true
  302. rs.cronState.entryID = entryID
  303. return nil
  304. }
  305. func (rs *RuleState) runScheduleRule() error {
  306. rs.Lock()
  307. defer rs.Unlock()
  308. rs.cronState.cron = rs.Rule.Options.Cron
  309. rs.cronState.duration = rs.Rule.Options.Duration
  310. err := rs.start()
  311. if err != nil {
  312. return err
  313. }
  314. return nil
  315. }
  316. func (rs *RuleState) stopAfterDuration(d time.Duration, cronCtx context.Context) {
  317. after := time.After(d)
  318. go func(ctx context.Context) {
  319. select {
  320. case <-after:
  321. rs.Lock()
  322. defer rs.Unlock()
  323. if err := rs.stop(); err != nil {
  324. conf.Log.Errorf("close rule %s failed, err: %v", rs.RuleId, err)
  325. }
  326. return
  327. case <-cronCtx.Done():
  328. return
  329. }
  330. }(cronCtx)
  331. }
  332. func (rs *RuleState) start() error {
  333. if rs.triggered != 1 {
  334. // If the rule has been stopped due to error, the topology is not nil
  335. if rs.Topology != nil {
  336. rs.Topology.Cancel()
  337. }
  338. if tp, err := planner.Plan(rs.Rule); err != nil {
  339. return err
  340. } else {
  341. rs.Topology = tp
  342. }
  343. rs.triggered = 1
  344. }
  345. rs.ActionCh <- ActionSignalStart
  346. return nil
  347. }
// Stop stops the rule while holding the state lock: a scheduled rule is first
// removed from the background cron, then the topology is cancelled and the
// action loop is told to stop. It returns an error if the rule has already
// been deleted.
func (rs *RuleState) Stop() error {
	rs.Lock()
	defer rs.Unlock()
	rs.stopScheduleRule()
	return rs.stop()
}
  355. func (rs *RuleState) stopScheduleRule() {
  356. if rs.Rule.IsScheduleRule() && rs.cronState.isInSchedule {
  357. rs.cronState.isInSchedule = false
  358. if rs.cronState.cancel != nil {
  359. rs.cronState.cancel()
  360. }
  361. rs.cronState.startFailedCnt = 0
  362. backgroundCron.Remove(rs.cronState.entryID)
  363. }
  364. }
  365. func (rs *RuleState) stop() error {
  366. if rs.triggered == -1 {
  367. return fmt.Errorf("rule %s is already deleted", rs.RuleId)
  368. }
  369. rs.triggered = 0
  370. if rs.Topology != nil {
  371. rs.Topology.Cancel()
  372. }
  373. rs.ActionCh <- ActionSignalStop
  374. return nil
  375. }
  376. func (rs *RuleState) Close() error {
  377. rs.Lock()
  378. defer rs.Unlock()
  379. if rs.Topology != nil {
  380. rs.Topology.RemoveMetrics()
  381. }
  382. if rs.triggered == 1 && rs.Topology != nil {
  383. rs.Topology.Cancel()
  384. }
  385. rs.triggered = -1
  386. rs.stopScheduleRule()
  387. close(rs.ActionCh)
  388. return nil
  389. }
  390. func (rs *RuleState) GetState() (string, error) {
  391. rs.RLock()
  392. defer rs.RUnlock()
  393. result := ""
  394. if rs.Topology == nil {
  395. result = "Stopped: fail to create the topo."
  396. } else {
  397. c := (*rs.Topology).GetContext()
  398. if c != nil {
  399. err := c.Err()
  400. switch err {
  401. case nil:
  402. result = "Running"
  403. case context.Canceled:
  404. if rs.Rule.IsScheduleRule() && rs.cronState.isInSchedule {
  405. if schedule.IsAfterTimeRanges(conf.GetNow(), rs.Rule.Options.CronDatetimeRange) {
  406. result = "Stopped: schedule terminated."
  407. } else {
  408. result = "Stopped: waiting for next schedule."
  409. }
  410. } else {
  411. result = "Stopped: canceled manually."
  412. }
  413. case context.DeadlineExceeded:
  414. result = "Stopped: deadline exceed."
  415. default:
  416. result = fmt.Sprintf("Stopped: %v.", err)
  417. }
  418. } else {
  419. if rs.cronState.isInSchedule {
  420. if schedule.IsAfterTimeRanges(conf.GetNow(), rs.Rule.Options.CronDatetimeRange) {
  421. result = "Stopped: schedule terminated."
  422. } else {
  423. result = "Stopped: waiting for next schedule."
  424. }
  425. } else {
  426. result = "Stopped: canceled manually."
  427. }
  428. }
  429. }
  430. if rs.Rule.IsScheduleRule() && rs.cronState.startFailedCnt > 0 {
  431. result = result + fmt.Sprintf(" Start failed count: %v.", rs.cronState.startFailedCnt)
  432. }
  433. return result, nil
  434. }
  435. func (rs *RuleState) GetTopoGraph() *api.PrintableTopo {
  436. rs.RLock()
  437. defer rs.RUnlock()
  438. if rs.topoGraph != nil {
  439. return rs.topoGraph
  440. } else if rs.Topology != nil {
  441. return rs.Topology.GetTopo()
  442. } else {
  443. return nil
  444. }
  445. }
  446. func (rs *RuleState) isInRunningSchedule(now time.Time, d time.Duration) (bool, time.Duration, error) {
  447. allowed, err := rs.isInAllowedTimeRange(now)
  448. if err != nil {
  449. return false, 0, err
  450. }
  451. if !allowed {
  452. return false, 0, nil
  453. }
  454. cronExpr := rs.Rule.Options.Cron
  455. if strings.HasPrefix(cronExpr, "mock") {
  456. return false, 0, nil
  457. }
  458. return schedule.IsInRunningSchedule(cronExpr, now, d)
  459. }
  460. func (rs *RuleState) isInAllowedTimeRange(now time.Time) (bool, error) {
  461. allowed := true
  462. var err error
  463. for _, timeRange := range rs.Rule.Options.CronDatetimeRange {
  464. allowed, err = schedule.IsInScheduleRange(now, timeRange.Begin, timeRange.End)
  465. if err != nil {
  466. return false, err
  467. }
  468. if allowed {
  469. break
  470. }
  471. }
  472. return allowed, nil
  473. }