lexical.go 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. package xsql
  2. import (
  3. "bufio"
  4. "bytes"
  5. "github.com/emqx/kuiper/pkg/ast"
  6. "io"
  7. "strconv"
  8. "strings"
  9. )
  10. type Scanner struct {
  11. r *bufio.Reader
  12. }
  13. func NewScanner(r io.Reader) *Scanner {
  14. return &Scanner{r: bufio.NewReader(r)}
  15. }
  16. func (s *Scanner) Scan() (tok ast.Token, lit string) {
  17. ch := s.read()
  18. if isWhiteSpace(ch) {
  19. //s.unread()
  20. return s.ScanWhiteSpace()
  21. } else if isLetter(ch) {
  22. s.unread()
  23. return s.ScanIdent()
  24. } else if isQuotation(ch) {
  25. s.unread()
  26. return s.ScanString()
  27. } else if isDigit(ch) {
  28. s.unread()
  29. return s.ScanNumber(false, false)
  30. } else if isBackquote(ch) {
  31. return s.ScanBackquoteIdent()
  32. }
  33. switch ch {
  34. case eof:
  35. return ast.EOF, ast.Tokens[ast.EOF]
  36. case '=':
  37. return ast.EQ, ast.Tokens[ast.EQ]
  38. case '!':
  39. _, _ = s.ScanWhiteSpace()
  40. if r := s.read(); r == '=' {
  41. return ast.NEQ, ast.Tokens[ast.NEQ]
  42. } else {
  43. s.unread()
  44. }
  45. return ast.EQ, ast.Tokens[ast.EQ]
  46. case '<':
  47. _, _ = s.ScanWhiteSpace()
  48. if r := s.read(); r == '=' {
  49. return ast.LTE, ast.Tokens[ast.LTE]
  50. } else {
  51. s.unread()
  52. }
  53. return ast.LT, ast.Tokens[ast.LT]
  54. case '>':
  55. _, _ = s.ScanWhiteSpace()
  56. if r := s.read(); r == '=' {
  57. return ast.GTE, ast.Tokens[ast.GTE]
  58. } else {
  59. s.unread()
  60. }
  61. return ast.GT, ast.Tokens[ast.GT]
  62. case '+':
  63. return ast.ADD, ast.Tokens[ast.ADD]
  64. case '-':
  65. _, _ = s.ScanWhiteSpace()
  66. if r := s.read(); r == '-' {
  67. s.skipUntilNewline()
  68. return ast.COMMENT, ""
  69. } else if r == '>' {
  70. return ast.ARROW, ast.Tokens[ast.ARROW]
  71. } else if r == '.' {
  72. _, _ = s.ScanWhiteSpace()
  73. if r1 := s.read(); isDigit(r1) {
  74. s.unread()
  75. return s.ScanNumber(true, true)
  76. } else {
  77. s.unread()
  78. }
  79. s.unread()
  80. } else {
  81. s.unread()
  82. }
  83. return ast.SUB, ast.Tokens[ast.SUB]
  84. case '/':
  85. _, _ = s.ScanWhiteSpace()
  86. if r := s.read(); r == '*' {
  87. if err := s.skipUntilEndComment(); err != nil {
  88. return ast.ILLEGAL, ""
  89. }
  90. return ast.COMMENT, ""
  91. } else {
  92. s.unread()
  93. }
  94. return ast.DIV, ast.Tokens[ast.DIV]
  95. case '.':
  96. if r := s.read(); isDigit(r) {
  97. s.unread()
  98. return s.ScanNumber(true, false)
  99. }
  100. s.unread()
  101. return ast.DOT, ast.Tokens[ast.DOT]
  102. case '%':
  103. return ast.MOD, ast.Tokens[ast.MOD]
  104. case '&':
  105. return ast.BITWISE_AND, ast.Tokens[ast.BITWISE_AND]
  106. case '|':
  107. return ast.BITWISE_OR, ast.Tokens[ast.BITWISE_OR]
  108. case '^':
  109. return ast.BITWISE_XOR, ast.Tokens[ast.BITWISE_XOR]
  110. case '*':
  111. return ast.ASTERISK, ast.Tokens[ast.ASTERISK]
  112. case ',':
  113. return ast.COMMA, ast.Tokens[ast.COMMA]
  114. case '(':
  115. return ast.LPAREN, ast.Tokens[ast.LPAREN]
  116. case ')':
  117. return ast.RPAREN, ast.Tokens[ast.RPAREN]
  118. case '[':
  119. return ast.LBRACKET, ast.Tokens[ast.LBRACKET]
  120. case ']':
  121. return ast.RBRACKET, ast.Tokens[ast.RBRACKET]
  122. case ':':
  123. return ast.COLON, ast.Tokens[ast.COLON]
  124. case '#':
  125. return ast.HASH, ast.Tokens[ast.HASH]
  126. case ';':
  127. return ast.SEMICOLON, ast.Tokens[ast.SEMICOLON]
  128. }
  129. return ast.ILLEGAL, ""
  130. }
  131. func (s *Scanner) ScanIdent() (tok ast.Token, lit string) {
  132. var buf bytes.Buffer
  133. buf.WriteRune(s.read())
  134. for {
  135. if ch := s.read(); ch == eof {
  136. break
  137. } else if !isLetter(ch) && !isDigit(ch) && ch != '_' {
  138. s.unread()
  139. break
  140. } else {
  141. buf.WriteRune(ch)
  142. }
  143. }
  144. switch lit = strings.ToUpper(buf.String()); lit {
  145. case "SELECT":
  146. return ast.SELECT, lit
  147. case "AS":
  148. return ast.AS, lit
  149. case "FROM":
  150. return ast.FROM, lit
  151. case "WHERE":
  152. return ast.WHERE, lit
  153. case "AND":
  154. return ast.AND, lit
  155. case "OR":
  156. return ast.OR, lit
  157. case "GROUP":
  158. return ast.GROUP, lit
  159. case "HAVING":
  160. return ast.HAVING, lit
  161. case "ORDER":
  162. return ast.ORDER, lit
  163. case "BY":
  164. return ast.BY, lit
  165. case "DESC":
  166. return ast.DESC, lit
  167. case "ASC":
  168. return ast.ASC, lit
  169. case "FILTER":
  170. return ast.FILTER, lit
  171. case "INNER":
  172. return ast.INNER, lit
  173. case "LEFT":
  174. return ast.LEFT, lit
  175. case "RIGHT":
  176. return ast.RIGHT, lit
  177. case "FULL":
  178. return ast.FULL, lit
  179. case "CROSS":
  180. return ast.CROSS, lit
  181. case "JOIN":
  182. return ast.JOIN, lit
  183. case "ON":
  184. return ast.ON, lit
  185. case "CASE":
  186. return ast.CASE, lit
  187. case "WHEN":
  188. return ast.WHEN, lit
  189. case "THEN":
  190. return ast.THEN, lit
  191. case "ELSE":
  192. return ast.ELSE, lit
  193. case "END":
  194. return ast.END, lit
  195. case "CREATE":
  196. return ast.CREATE, lit
  197. case "DROP":
  198. return ast.DROP, lit
  199. case "EXPLAIN":
  200. return ast.EXPLAIN, lit
  201. case "DESCRIBE":
  202. return ast.DESCRIBE, lit
  203. case "SHOW":
  204. return ast.SHOW, lit
  205. case "STREAM":
  206. return ast.STREAM, lit
  207. case "STREAMS":
  208. return ast.STREAMS, lit
  209. case "TABLE":
  210. return ast.TABLE, lit
  211. case "TABLES":
  212. return ast.TABLES, lit
  213. case "WITH":
  214. return ast.WITH, lit
  215. case "BIGINT":
  216. return ast.XBIGINT, lit
  217. case "FLOAT":
  218. return ast.XFLOAT, lit
  219. case "DATETIME":
  220. return ast.XDATETIME, lit
  221. case "STRING":
  222. return ast.XSTRING, lit
  223. case "BYTEA":
  224. return ast.XBYTEA, lit
  225. case "BOOLEAN":
  226. return ast.XBOOLEAN, lit
  227. case "ARRAY":
  228. return ast.XARRAY, lit
  229. case "STRUCT":
  230. return ast.XSTRUCT, lit
  231. case "DATASOURCE":
  232. return ast.DATASOURCE, lit
  233. case "KEY":
  234. return ast.KEY, lit
  235. case "FORMAT":
  236. return ast.FORMAT, lit
  237. case "CONF_KEY":
  238. return ast.CONF_KEY, lit
  239. case "TYPE":
  240. return ast.TYPE, lit
  241. case "TRUE":
  242. return ast.TRUE, lit
  243. case "FALSE":
  244. return ast.FALSE, lit
  245. case "STRICT_VALIDATION":
  246. return ast.STRICT_VALIDATION, lit
  247. case "TIMESTAMP":
  248. return ast.TIMESTAMP, lit
  249. case "TIMESTAMP_FORMAT":
  250. return ast.TIMESTAMP_FORMAT, lit
  251. case "RETAIN_SIZE":
  252. return ast.RETAIN_SIZE, lit
  253. case "SHARED":
  254. return ast.SHARED, lit
  255. case "DD":
  256. return ast.DD, lit
  257. case "HH":
  258. return ast.HH, lit
  259. case "MI":
  260. return ast.MI, lit
  261. case "SS":
  262. return ast.SS, lit
  263. case "MS":
  264. return ast.MS, lit
  265. }
  266. return ast.IDENT, buf.String()
  267. }
  268. func (s *Scanner) ScanString() (tok ast.Token, lit string) {
  269. var buf bytes.Buffer
  270. ch := s.read()
  271. buf.WriteRune(ch)
  272. escape := false
  273. for {
  274. ch = s.read()
  275. if ch == '"' && !escape {
  276. buf.WriteRune(ch)
  277. break
  278. } else if ch == eof {
  279. return ast.BADSTRING, buf.String()
  280. } else if ch == '\\' && !escape {
  281. escape = true
  282. buf.WriteRune(ch)
  283. } else {
  284. escape = false
  285. buf.WriteRune(ch)
  286. }
  287. }
  288. r, _ := strconv.Unquote(buf.String())
  289. return ast.STRING, r
  290. }
  291. func (s *Scanner) ScanDigit() (tok ast.Token, lit string) {
  292. var buf bytes.Buffer
  293. ch := s.read()
  294. buf.WriteRune(ch)
  295. for {
  296. if ch := s.read(); isDigit(ch) {
  297. buf.WriteRune(ch)
  298. } else {
  299. s.unread()
  300. break
  301. }
  302. }
  303. return ast.INTEGER, buf.String()
  304. }
  305. func (s *Scanner) ScanNumber(startWithDot bool, isNeg bool) (tok ast.Token, lit string) {
  306. var buf bytes.Buffer
  307. if isNeg {
  308. buf.WriteRune('-')
  309. }
  310. if startWithDot {
  311. buf.WriteRune('.')
  312. }
  313. ch := s.read()
  314. buf.WriteRune(ch)
  315. isNum := false
  316. for {
  317. if ch := s.read(); isDigit(ch) {
  318. buf.WriteRune(ch)
  319. } else if ch == '.' {
  320. isNum = true
  321. buf.WriteRune(ch)
  322. } else {
  323. s.unread()
  324. break
  325. }
  326. }
  327. if isNum || startWithDot {
  328. return ast.NUMBER, buf.String()
  329. } else {
  330. return ast.INTEGER, buf.String()
  331. }
  332. }
  333. func (s *Scanner) ScanBackquoteIdent() (tok ast.Token, lit string) {
  334. var buf bytes.Buffer
  335. for {
  336. ch := s.read()
  337. if isBackquote(ch) || ch == eof {
  338. break
  339. } else {
  340. buf.WriteRune(ch)
  341. }
  342. }
  343. return ast.IDENT, buf.String()
  344. }
  345. func (s *Scanner) skipUntilNewline() {
  346. for {
  347. if ch := s.read(); ch == '\n' || ch == eof {
  348. return
  349. }
  350. }
  351. }
  352. func (s *Scanner) skipUntilEndComment() error {
  353. for {
  354. if ch1 := s.read(); ch1 == '*' {
  355. // We might be at the end.
  356. star:
  357. ch2 := s.read()
  358. if ch2 == '/' {
  359. return nil
  360. } else if ch2 == '*' {
  361. // We are back in the state machine since we see a star.
  362. goto star
  363. } else if ch2 == eof {
  364. return io.EOF
  365. }
  366. } else if ch1 == eof {
  367. return io.EOF
  368. }
  369. }
  370. }
  371. func (s *Scanner) ScanWhiteSpace() (tok ast.Token, lit string) {
  372. var buf bytes.Buffer
  373. for {
  374. if ch := s.read(); ch == eof {
  375. break
  376. } else if !isWhiteSpace(ch) {
  377. s.unread()
  378. break
  379. } else {
  380. buf.WriteRune(ch)
  381. }
  382. }
  383. return ast.WS, buf.String()
  384. }
  385. func (s *Scanner) read() rune {
  386. ch, _, err := s.r.ReadRune()
  387. if err != nil {
  388. return eof
  389. }
  390. return ch
  391. }
  392. func (s *Scanner) unread() {
  393. _ = s.r.UnreadRune()
  394. }
  395. var eof = rune(0)
  396. func isWhiteSpace(r rune) bool {
  397. return (r == ' ') || (r == '\t') || (r == '\r') || (r == '\n')
  398. }
  399. func isLetter(ch rune) bool { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') }
  400. func isDigit(ch rune) bool { return ch >= '0' && ch <= '9' }
  401. func isQuotation(ch rune) bool { return ch == '"' }
  402. func isBackquote(ch rune) bool { return ch == '`' }