lexical.go 12 KB


  1. package xsql
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "strconv"
  7. "strings"
  8. )
  // Token identifies the lexical class of a unit produced by the Scanner.
  type Token int

  // The order of these constants is significant: isOperator relies on the
  // operatorBeg/operatorEnd markers and isTimeLiteral on the DD..MS range.
  const (
  	// Special tokens
  	ILLEGAL Token = iota
  	EOF
  	WS
  	COMMENT
  	AS

  	// Literals
  	IDENT     // main
  	INTEGER   // 12345
  	NUMBER    // 12345.67
  	STRING    // "abc"
  	BADSTRING // "abc

  	operatorBeg
  	// Operators: every token strictly between operatorBeg and operatorEnd.
  	ADD         // +
  	SUB         // -
  	MUL         // *
  	DIV         // /
  	MOD         // %
  	BITWISE_AND // &
  	BITWISE_OR  // |
  	BITWISE_XOR // ^
  	AND         // AND
  	OR          // OR
  	EQ          // =
  	NEQ         // !=
  	LT          // <
  	LTE         // <=
  	GT          // >
  	GTE         // >=
  	SUBSET      // [
  	ARROW       // ->
  	operatorEnd

  	// Misc characters
  	ASTERISK  // *
  	COMMA     // ,
  	LPAREN    // (
  	RPAREN    // )
  	LBRACKET  // [
  	RBRACKET  // ]
  	HASH      // #
  	DOT       // .
  	COLON     // :
  	SEMICOLON // ;
  	COLSEP    // \007

  	// Keywords
  	SELECT
  	FROM
  	JOIN
  	INNER
  	LEFT
  	RIGHT
  	FULL
  	CROSS
  	ON
  	WHERE
  	GROUP
  	ORDER
  	HAVING
  	BY
  	ASC
  	DESC
  	FILTER
  	CASE
  	WHEN
  	THEN
  	ELSE
  	END
  	TRUE
  	FALSE
  	CREATE
  	DROP
  	EXPLAIN
  	DESCRIBE
  	SHOW
  	STREAM
  	TABLE
  	STREAMS
  	TABLES
  	WITH
  	XBIGINT
  	XFLOAT
  	XSTRING
  	XBYTEA
  	XDATETIME
  	XBOOLEAN
  	XARRAY
  	XSTRUCT
  	DATASOURCE
  	KEY
  	FORMAT
  	CONF_KEY
  	TYPE
  	STRICT_VALIDATION
  	TIMESTAMP
  	TIMESTAMP_FORMAT
  	RETAIN_SIZE
  	DD
  	HH
  	MI
  	SS
  	MS
  )

  // tokens maps each Token to its printable form. Tokens without an entry
  // (the operatorBeg/operatorEnd markers) print as the empty string.
  var tokens = []string{
  	ILLEGAL:   "ILLEGAL",
  	EOF:       "EOF",
  	WS:        "WS",
  	COMMENT:   "COMMENT",
  	AS:        "AS",
  	IDENT:     "IDENT",
  	INTEGER:   "INTEGER",
  	NUMBER:    "NUMBER",
  	STRING:    "STRING",
  	BADSTRING: "BADSTRING",

  	ADD:         "+",
  	SUB:         "-",
  	MUL:         "*",
  	DIV:         "/",
  	MOD:         "%",
  	BITWISE_AND: "&",
  	BITWISE_OR:  "|",
  	BITWISE_XOR: "^",
  	AND:         "AND",
  	OR:          "OR",
  	EQ:          "=",
  	NEQ:         "!=",
  	LT:          "<",
  	LTE:         "<=",
  	GT:          ">",
  	GTE:         ">=",
  	SUBSET:      "[]",
  	ARROW:       "->",

  	ASTERISK:  "*",
  	COMMA:     ",",
  	LPAREN:    "(",
  	RPAREN:    ")",
  	LBRACKET:  "[",
  	RBRACKET:  "]",
  	HASH:      "#",
  	DOT:       ".",
  	COLON:     ":",
  	SEMICOLON: ";",
  	COLSEP:    "\007",

  	SELECT:   "SELECT",
  	FROM:     "FROM",
  	JOIN:     "JOIN",
  	INNER:    "INNER",
  	LEFT:     "LEFT",
  	RIGHT:    "RIGHT",
  	FULL:     "FULL",
  	CROSS:    "CROSS",
  	ON:       "ON",
  	WHERE:    "WHERE",
  	GROUP:    "GROUP",
  	ORDER:    "ORDER",
  	HAVING:   "HAVING",
  	BY:       "BY",
  	ASC:      "ASC",
  	DESC:     "DESC",
  	FILTER:   "FILTER",
  	CASE:     "CASE",
  	WHEN:     "WHEN",
  	THEN:     "THEN",
  	ELSE:     "ELSE",
  	END:      "END",
  	TRUE:     "TRUE",
  	FALSE:    "FALSE",
  	CREATE:   "CREATE",
  	DROP:     "DROP",
  	EXPLAIN:  "EXPLAIN",
  	DESCRIBE: "DESCRIBE",
  	SHOW:     "SHOW",
  	STREAM:   "STREAM",
  	TABLE:    "TABLE",
  	STREAMS:  "STREAMS",
  	TABLES:   "TABLES",
  	WITH:     "WITH",

  	XBIGINT:   "BIGINT",
  	XFLOAT:    "FLOAT",
  	XSTRING:   "STRING",
  	XBYTEA:    "BYTEA",
  	XDATETIME: "DATETIME",
  	XBOOLEAN:  "BOOLEAN",
  	XARRAY:    "ARRAY",
  	XSTRUCT:   "STRUCT",

  	DATASOURCE:        "DATASOURCE",
  	KEY:               "KEY",
  	FORMAT:            "FORMAT",
  	CONF_KEY:          "CONF_KEY",
  	TYPE:              "TYPE",
  	STRICT_VALIDATION: "STRICT_VALIDATION",
  	TIMESTAMP:         "TIMESTAMP",
  	TIMESTAMP_FORMAT:  "TIMESTAMP_FORMAT",
  	RETAIN_SIZE:       "RETAIN_SIZE",

  	DD: "DD",
  	HH: "HH",
  	MI: "MI",
  	SS: "SS",
  	MS: "MS",
  }

  // String returns the printable form of tok, or the empty string when tok is
  // outside the range covered by the tokens table.
  func (tok Token) String() string {
  	if tok < 0 || int(tok) >= len(tokens) {
  		return ""
  	}
  	return tokens[tok]
  }
  // Scanner is a lexical scanner that pulls runes from a buffered reader.
  type Scanner struct {
  	r *bufio.Reader // buffered source; supports a single-rune push-back
  }

  // NewScanner returns a Scanner that reads its input from r through a
  // bufio.Reader.
  func NewScanner(r io.Reader) *Scanner {
  	sc := new(Scanner)
  	sc.r = bufio.NewReader(r)
  	return sc
  }
  212. func (s *Scanner) Scan() (tok Token, lit string) {
  213. ch := s.read()
  214. if isWhiteSpace(ch) {
  215. //s.unread()
  216. return s.ScanWhiteSpace()
  217. } else if isLetter(ch) {
  218. s.unread()
  219. return s.ScanIdent()
  220. } else if isQuotation(ch) {
  221. s.unread()
  222. return s.ScanString()
  223. } else if isDigit(ch) {
  224. s.unread()
  225. return s.ScanNumber(false, false)
  226. } else if isBackquote(ch) {
  227. return s.ScanBackquoteIdent()
  228. }
  229. switch ch {
  230. case eof:
  231. return EOF, tokens[EOF]
  232. case '=':
  233. return EQ, tokens[EQ]
  234. case '!':
  235. _, _ = s.ScanWhiteSpace()
  236. if r := s.read(); r == '=' {
  237. return NEQ, tokens[NEQ]
  238. } else {
  239. s.unread()
  240. }
  241. return EQ, tokens[EQ]
  242. case '<':
  243. _, _ = s.ScanWhiteSpace()
  244. if r := s.read(); r == '=' {
  245. return LTE, tokens[LTE]
  246. } else {
  247. s.unread()
  248. }
  249. return LT, tokens[LT]
  250. case '>':
  251. _, _ = s.ScanWhiteSpace()
  252. if r := s.read(); r == '=' {
  253. return GTE, tokens[GTE]
  254. } else {
  255. s.unread()
  256. }
  257. return GT, tokens[GT]
  258. case '+':
  259. return ADD, tokens[ADD]
  260. case '-':
  261. _, _ = s.ScanWhiteSpace()
  262. if r := s.read(); r == '-' {
  263. s.skipUntilNewline()
  264. return COMMENT, ""
  265. } else if r == '>' {
  266. return ARROW, tokens[ARROW]
  267. } else if isDigit(r) {
  268. s.unread()
  269. return s.ScanNumber(false, true)
  270. } else if r == '.' {
  271. _, _ = s.ScanWhiteSpace()
  272. if r1 := s.read(); isDigit(r1) {
  273. s.unread()
  274. return s.ScanNumber(true, true)
  275. } else {
  276. s.unread()
  277. }
  278. s.unread()
  279. } else {
  280. s.unread()
  281. }
  282. return SUB, tokens[SUB]
  283. case '/':
  284. _, _ = s.ScanWhiteSpace()
  285. if r := s.read(); r == '*' {
  286. if err := s.skipUntilEndComment(); err != nil {
  287. return ILLEGAL, ""
  288. }
  289. return COMMENT, ""
  290. } else {
  291. s.unread()
  292. }
  293. return DIV, tokens[DIV]
  294. case '.':
  295. if r := s.read(); isDigit(r) {
  296. s.unread()
  297. return s.ScanNumber(true, false)
  298. }
  299. s.unread()
  300. return DOT, tokens[DOT]
  301. case '%':
  302. return MOD, tokens[MOD]
  303. case '&':
  304. return BITWISE_AND, tokens[BITWISE_AND]
  305. case '|':
  306. return BITWISE_OR, tokens[BITWISE_OR]
  307. case '^':
  308. return BITWISE_XOR, tokens[BITWISE_XOR]
  309. case '*':
  310. return ASTERISK, tokens[ASTERISK]
  311. case ',':
  312. return COMMA, tokens[COMMA]
  313. case '(':
  314. return LPAREN, tokens[LPAREN]
  315. case ')':
  316. return RPAREN, tokens[RPAREN]
  317. case '[':
  318. return LBRACKET, tokens[LBRACKET]
  319. case ']':
  320. return RBRACKET, tokens[RBRACKET]
  321. case ':':
  322. return COLON, tokens[COLON]
  323. case '#':
  324. return HASH, tokens[HASH]
  325. case ';':
  326. return SEMICOLON, tokens[SEMICOLON]
  327. }
  328. return ILLEGAL, ""
  329. }
  330. func (s *Scanner) ScanIdent() (tok Token, lit string) {
  331. var buf bytes.Buffer
  332. buf.WriteRune(s.read())
  333. for {
  334. if ch := s.read(); ch == eof {
  335. break
  336. } else if !isLetter(ch) && !isDigit(ch) && ch != '_' {
  337. s.unread()
  338. break
  339. } else {
  340. buf.WriteRune(ch)
  341. }
  342. }
  343. switch lit = strings.ToUpper(buf.String()); lit {
  344. case "SELECT":
  345. return SELECT, lit
  346. case "AS":
  347. return AS, lit
  348. case "FROM":
  349. return FROM, lit
  350. case "WHERE":
  351. return WHERE, lit
  352. case "AND":
  353. return AND, lit
  354. case "OR":
  355. return OR, lit
  356. case "GROUP":
  357. return GROUP, lit
  358. case "HAVING":
  359. return HAVING, lit
  360. case "ORDER":
  361. return ORDER, lit
  362. case "BY":
  363. return BY, lit
  364. case "DESC":
  365. return DESC, lit
  366. case "ASC":
  367. return ASC, lit
  368. case "FILTER":
  369. return FILTER, lit
  370. case "INNER":
  371. return INNER, lit
  372. case "LEFT":
  373. return LEFT, lit
  374. case "RIGHT":
  375. return RIGHT, lit
  376. case "FULL":
  377. return FULL, lit
  378. case "CROSS":
  379. return CROSS, lit
  380. case "JOIN":
  381. return JOIN, lit
  382. case "ON":
  383. return ON, lit
  384. case "CASE":
  385. return CASE, lit
  386. case "WHEN":
  387. return WHEN, lit
  388. case "THEN":
  389. return THEN, lit
  390. case "ELSE":
  391. return ELSE, lit
  392. case "END":
  393. return END, lit
  394. case "CREATE":
  395. return CREATE, lit
  396. case "DROP":
  397. return DROP, lit
  398. case "EXPLAIN":
  399. return EXPLAIN, lit
  400. case "DESCRIBE":
  401. return DESCRIBE, lit
  402. case "SHOW":
  403. return SHOW, lit
  404. case "STREAM":
  405. return STREAM, lit
  406. case "STREAMS":
  407. return STREAMS, lit
  408. case "TABLE":
  409. return TABLE, lit
  410. case "TABLES":
  411. return TABLES, lit
  412. case "WITH":
  413. return WITH, lit
  414. case "BIGINT":
  415. return XBIGINT, lit
  416. case "FLOAT":
  417. return XFLOAT, lit
  418. case "DATETIME":
  419. return XDATETIME, lit
  420. case "STRING":
  421. return XSTRING, lit
  422. case "BYTEA":
  423. return XBYTEA, lit
  424. case "BOOLEAN":
  425. return XBOOLEAN, lit
  426. case "ARRAY":
  427. return XARRAY, lit
  428. case "STRUCT":
  429. return XSTRUCT, lit
  430. case "DATASOURCE":
  431. return DATASOURCE, lit
  432. case "KEY":
  433. return KEY, lit
  434. case "FORMAT":
  435. return FORMAT, lit
  436. case "CONF_KEY":
  437. return CONF_KEY, lit
  438. case "TYPE":
  439. return TYPE, lit
  440. case "TRUE":
  441. return TRUE, lit
  442. case "FALSE":
  443. return FALSE, lit
  444. case "STRICT_VALIDATION":
  445. return STRICT_VALIDATION, lit
  446. case "TIMESTAMP":
  447. return TIMESTAMP, lit
  448. case "TIMESTAMP_FORMAT":
  449. return TIMESTAMP_FORMAT, lit
  450. case "RETAIN_SIZE":
  451. return RETAIN_SIZE, lit
  452. case "DD":
  453. return DD, lit
  454. case "HH":
  455. return HH, lit
  456. case "MI":
  457. return MI, lit
  458. case "SS":
  459. return SS, lit
  460. case "MS":
  461. return MS, lit
  462. }
  463. return IDENT, buf.String()
  464. }
  465. func (s *Scanner) ScanString() (tok Token, lit string) {
  466. var buf bytes.Buffer
  467. ch := s.read()
  468. buf.WriteRune(ch)
  469. escape := false
  470. for {
  471. ch = s.read()
  472. if ch == '"' && !escape {
  473. buf.WriteRune(ch)
  474. break
  475. } else if ch == eof {
  476. return BADSTRING, buf.String()
  477. } else if ch == '\\' && !escape {
  478. escape = true
  479. buf.WriteRune(ch)
  480. } else {
  481. escape = false
  482. buf.WriteRune(ch)
  483. }
  484. }
  485. r, _ := strconv.Unquote(buf.String())
  486. return STRING, r
  487. }
  488. func (s *Scanner) ScanDigit() (tok Token, lit string) {
  489. var buf bytes.Buffer
  490. ch := s.read()
  491. buf.WriteRune(ch)
  492. for {
  493. if ch := s.read(); isDigit(ch) {
  494. buf.WriteRune(ch)
  495. } else {
  496. s.unread()
  497. break
  498. }
  499. }
  500. return INTEGER, buf.String()
  501. }
  502. func (s *Scanner) ScanNumber(startWithDot bool, isNeg bool) (tok Token, lit string) {
  503. var buf bytes.Buffer
  504. if isNeg {
  505. buf.WriteRune('-')
  506. }
  507. if startWithDot {
  508. buf.WriteRune('.')
  509. }
  510. ch := s.read()
  511. buf.WriteRune(ch)
  512. isNum := false
  513. for {
  514. if ch := s.read(); isDigit(ch) {
  515. buf.WriteRune(ch)
  516. } else if ch == '.' {
  517. isNum = true
  518. buf.WriteRune(ch)
  519. } else {
  520. s.unread()
  521. break
  522. }
  523. }
  524. if isNum || startWithDot {
  525. return NUMBER, buf.String()
  526. } else {
  527. return INTEGER, buf.String()
  528. }
  529. }
  530. func (s *Scanner) ScanBackquoteIdent() (tok Token, lit string) {
  531. var buf bytes.Buffer
  532. for {
  533. ch := s.read()
  534. if isBackquote(ch) || ch == eof {
  535. break
  536. } else {
  537. buf.WriteRune(ch)
  538. }
  539. }
  540. return IDENT, buf.String()
  541. }
  542. func (s *Scanner) skipUntilNewline() {
  543. for {
  544. if ch := s.read(); ch == '\n' || ch == eof {
  545. return
  546. }
  547. }
  548. }
  549. func (s *Scanner) skipUntilEndComment() error {
  550. for {
  551. if ch1 := s.read(); ch1 == '*' {
  552. // We might be at the end.
  553. star:
  554. ch2 := s.read()
  555. if ch2 == '/' {
  556. return nil
  557. } else if ch2 == '*' {
  558. // We are back in the state machine since we see a star.
  559. goto star
  560. } else if ch2 == eof {
  561. return io.EOF
  562. }
  563. } else if ch1 == eof {
  564. return io.EOF
  565. }
  566. }
  567. }
  568. func (s *Scanner) ScanWhiteSpace() (tok Token, lit string) {
  569. var buf bytes.Buffer
  570. for {
  571. if ch := s.read(); ch == eof {
  572. break
  573. } else if !isWhiteSpace(ch) {
  574. s.unread()
  575. break
  576. } else {
  577. buf.WriteRune(ch)
  578. }
  579. }
  580. return WS, buf.String()
  581. }
  582. func (s *Scanner) read() rune {
  583. ch, _, err := s.r.ReadRune()
  584. if err != nil {
  585. return eof
  586. }
  587. return ch
  588. }
  589. func (s *Scanner) unread() {
  590. _ = s.r.UnreadRune()
  591. }
  // eof is the sentinel rune returned by Scanner.read when no more input can
  // be read. It is a constant so it cannot be reassigned by accident.
  //
  // NOTE(review): a literal NUL rune in the input is indistinguishable from
  // the end of input.
  const eof = rune(0)
  // isWhiteSpace reports whether r is a space, tab, carriage return or
  // newline.
  func isWhiteSpace(r rune) bool {
  	switch r {
  	case ' ', '\t', '\r', '\n':
  		return true
  	}
  	return false
  }
  // isLetter reports whether ch is an ASCII letter (a-z or A-Z).
  func isLetter(ch rune) bool {
  	switch {
  	case 'a' <= ch && ch <= 'z':
  		return true
  	case 'A' <= ch && ch <= 'Z':
  		return true
  	}
  	return false
  }
  // isDigit reports whether ch is an ASCII decimal digit (0-9).
  func isDigit(ch rune) bool {
  	if ch < '0' {
  		return false
  	}
  	return ch <= '9'
  }
  // isQuotation reports whether ch is the double-quote character.
  func isQuotation(ch rune) bool {
  	const doubleQuote = '"'
  	return ch == doubleQuote
  }
  // isBackquote reports whether ch is the back-quote character.
  func isBackquote(ch rune) bool {
  	const backquote = '`'
  	return ch == backquote
  }
  600. func (tok Token) isOperator() bool {
  601. return (tok > operatorBeg && tok < operatorEnd) || tok == ASTERISK || tok == LBRACKET
  602. }
  603. func (tok Token) isTimeLiteral() bool { return tok >= DD && tok <= MS }
  604. func (tok Token) allowedSourceToken() bool {
  605. return tok == IDENT || tok == DIV || tok == HASH || tok == ADD
  606. }
  607. //Allowed special field name token
  608. func (tok Token) allowedSFNToken() bool { return tok == DOT }
  609. func (tok Token) Precedence() int {
  610. switch tok {
  611. case OR:
  612. return 1
  613. case AND:
  614. return 2
  615. case EQ, NEQ, LT, LTE, GT, GTE:
  616. return 3
  617. case ADD, SUB, BITWISE_OR, BITWISE_XOR:
  618. return 4
  619. case MUL, DIV, MOD, BITWISE_AND, SUBSET, ARROW:
  620. return 5
  621. }
  622. return 0
  623. }
  // DataType enumerates the field types known to the package.
  type DataType int

  // The order matters: isSimpleType treats BIGINT..BOOLEAN as one range.
  const (
  	UNKNOWN DataType = iota
  	BIGINT
  	FLOAT
  	STRINGS
  	BYTEA
  	DATETIME
  	BOOLEAN
  	ARRAY
  	STRUCT
  )

  // dataTypes holds the lower-case name of each DataType. UNKNOWN has no
  // entry and therefore maps to the empty string.
  var dataTypes = []string{
  	BIGINT:   "bigint",
  	FLOAT:    "float",
  	STRINGS:  "string",
  	BYTEA:    "bytea",
  	DATETIME: "datetime",
  	BOOLEAN:  "boolean",
  	ARRAY:    "array",
  	STRUCT:   "struct",
  }

  // isSimpleType reports whether d is one of the types from BIGINT through
  // BOOLEAN, i.e. neither UNKNOWN, ARRAY nor STRUCT.
  func (d DataType) isSimpleType() bool {
  	if d < BIGINT {
  		return false
  	}
  	return d <= BOOLEAN
  }

  // String returns the lower-case name of d, or the empty string when d has
  // no name in the dataTypes table.
  func (d DataType) String() string {
  	if d < 0 || int(d) >= len(dataTypes) {
  		return ""
  	}
  	return dataTypes[d]
  }
  655. func getDataType(tok Token) DataType {
  656. switch tok {
  657. case XBIGINT:
  658. return BIGINT
  659. case XFLOAT:
  660. return FLOAT
  661. case XSTRING:
  662. return STRINGS
  663. case XBYTEA:
  664. return BYTEA
  665. case XDATETIME:
  666. return DATETIME
  667. case XBOOLEAN:
  668. return BOOLEAN
  669. case XARRAY:
  670. return ARRAY
  671. case XSTRUCT:
  672. return STRUCT
  673. }
  674. return UNKNOWN
  675. }