lexical.go 11 KB


  1. package xsql
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "strconv"
  7. "strings"
  8. )
  // Token identifies the lexical class of a piece of scanned input: a special
  // marker, a literal, an operator, a punctuation character or a keyword.
  type Token int

  // The numeric value of every token is fixed by its position in this block
  // (iota), so the order must not change: isOperator depends on the
  // operatorBeg/operatorEnd sentinels and isTimeLiteral on DD..MS being
  // contiguous.
  const (
  	// Special tokens
  	ILLEGAL Token = iota
  	EOF
  	WS
  	COMMENT
  	AS

  	// Literals
  	IDENT     // main
  	INTEGER   // 12345
  	NUMBER    // 12345.67
  	STRING    // "abc"
  	BADSTRING // "abc

  	// operatorBeg and operatorEnd are unnamed sentinels; every token strictly
  	// between them is reported as an operator by isOperator.
  	operatorBeg
  	// ADD and the following are InfluxQL Operators
  	ADD         // +
  	SUB         // -
  	MUL         // *
  	DIV         // /
  	MOD         // %
  	BITWISE_AND // &
  	BITWISE_OR  // |
  	BITWISE_XOR // ^
  	AND         // AND
  	OR          // OR
  	EQ          // =
  	NEQ         // !=
  	LT          // <
  	LTE         // <=
  	GT          // >
  	GTE         // >=
  	SUBSET      // [
  	ARROW       // ->
  	operatorEnd

  	// Misc characters
  	ASTERISK  // *
  	COMMA     // ,
  	LPAREN    // (
  	RPAREN    // )
  	LBRACKET  // [
  	RBRACKET  // ]
  	HASH      // #
  	DOT       // .
  	COLON     // :
  	SEMICOLON // ;
  	COLSEP    // \007

  	// Keywords
  	SELECT
  	FROM
  	JOIN
  	INNER
  	LEFT
  	RIGHT
  	FULL
  	CROSS
  	ON
  	WHERE
  	GROUP
  	ORDER
  	HAVING
  	BY
  	ASC
  	DESC
  	FILTER
  	TRUE
  	FALSE
  	CREATE
  	DROP
  	EXPLAIN
  	DESCRIBE
  	SHOW
  	STREAM
  	STREAMS
  	WITH
  	XBIGINT
  	XFLOAT
  	XSTRING
  	XDATETIME
  	XBOOLEAN
  	XARRAY
  	XSTRUCT
  	DATASOURCE
  	KEY
  	FORMAT
  	CONF_KEY
  	TYPE
  	STRICT_VALIDATION
  	TIMESTAMP
  	TIMESTAMP_FORMAT
  	DD
  	HH
  	MI
  	SS
  	MS
  )

  // tokens holds the printable form of each Token, indexed by token value.
  // Entries are listed in declaration order. The operatorBeg/operatorEnd
  // sentinels intentionally have no entry and therefore print as "".
  var tokens = []string{
  	ILLEGAL: "ILLEGAL",
  	EOF:     "EOF",
  	WS:      "WS",
  	COMMENT: "COMMENT",
  	AS:      "AS",

  	IDENT:     "IDENT",
  	INTEGER:   "INTEGER",
  	NUMBER:    "NUMBER",
  	STRING:    "STRING",
  	BADSTRING: "BADSTRING",

  	ADD:         "+",
  	SUB:         "-",
  	MUL:         "*",
  	DIV:         "/",
  	MOD:         "%",
  	BITWISE_AND: "&",
  	BITWISE_OR:  "|",
  	BITWISE_XOR: "^",
  	AND:         "AND",
  	OR:          "OR",
  	EQ:          "=",
  	NEQ:         "!=",
  	LT:          "<",
  	LTE:         "<=",
  	GT:          ">",
  	GTE:         ">=",
  	SUBSET:      "[]",
  	ARROW:       "->",

  	ASTERISK:  "*",
  	COMMA:     ",",
  	LPAREN:    "(",
  	RPAREN:    ")",
  	LBRACKET:  "[",
  	RBRACKET:  "]",
  	HASH:      "#",
  	DOT:       ".",
  	COLON:     ":",
  	SEMICOLON: ";",
  	COLSEP:    "\007",

  	SELECT:            "SELECT",
  	FROM:              "FROM",
  	JOIN:              "JOIN",
  	INNER:             "INNER",
  	LEFT:              "LEFT",
  	RIGHT:             "RIGHT",
  	FULL:              "FULL",
  	CROSS:             "CROSS",
  	ON:                "ON",
  	WHERE:             "WHERE",
  	GROUP:             "GROUP",
  	ORDER:             "ORDER",
  	HAVING:            "HAVING",
  	BY:                "BY",
  	ASC:               "ASC",
  	DESC:              "DESC",
  	FILTER:            "FILTER",
  	TRUE:              "TRUE",
  	FALSE:             "FALSE",
  	CREATE:            "CREATE",
  	DROP:              "DROP",
  	EXPLAIN:           "EXPLAIN",
  	DESCRIBE:          "DESCRIBE",
  	SHOW:              "SHOW",
  	STREAM:            "STREAM",
  	STREAMS:           "STREAMS",
  	WITH:              "WITH",
  	XBIGINT:           "BIGINT",
  	XFLOAT:            "FLOAT",
  	XSTRING:           "STRING",
  	XDATETIME:         "DATETIME",
  	XBOOLEAN:          "BOOLEAN",
  	XARRAY:            "ARRAY",
  	XSTRUCT:           "STRUCT",
  	DATASOURCE:        "DATASOURCE",
  	KEY:               "KEY",
  	FORMAT:            "FORMAT",
  	CONF_KEY:          "CONF_KEY",
  	TYPE:              "TYPE",
  	STRICT_VALIDATION: "STRICT_VALIDATION",
  	TIMESTAMP:         "TIMESTAMP",
  	TIMESTAMP_FORMAT:  "TIMESTAMP_FORMAT",

  	DD: "DD",
  	HH: "HH",
  	MI: "MI",
  	SS: "SS",
  	MS: "MS",
  }

  // String returns the printable form of tok as recorded in the tokens table.
  // It returns "" for values outside the table and for tokens without an entry.
  func (tok Token) String() string {
  	if tok >= 0 && tok < Token(len(tokens)) {
  		return tokens[tok]
  	}
  	return ""
  }
  // Scanner is a lexical scanner that produces tokens from a buffered rune
  // stream.
  type Scanner struct {
  	r *bufio.Reader // buffered source the tokens are read from
  }

  // NewScanner returns a Scanner that reads its input from r through a
  // bufio.Reader.
  func NewScanner(r io.Reader) *Scanner {
  	s := new(Scanner)
  	s.r = bufio.NewReader(r)
  	return s
  }
  199. func (s *Scanner) Scan() (tok Token, lit string) {
  200. ch := s.read()
  201. if isWhiteSpace(ch) {
  202. //s.unread()
  203. return s.ScanWhiteSpace()
  204. } else if isLetter(ch) {
  205. s.unread()
  206. return s.ScanIdent()
  207. } else if isQuotation(ch) {
  208. s.unread()
  209. return s.ScanString()
  210. } else if isDigit(ch) {
  211. s.unread()
  212. return s.ScanNumber(false, false)
  213. } else if isBackquote(ch) {
  214. return s.ScanBackquoteIdent()
  215. }
  216. switch ch {
  217. case eof:
  218. return EOF, tokens[EOF]
  219. case '=':
  220. return EQ, tokens[EQ]
  221. case '!':
  222. _, _ = s.ScanWhiteSpace()
  223. if r := s.read(); r == '=' {
  224. return NEQ, tokens[NEQ]
  225. } else {
  226. s.unread()
  227. }
  228. return EQ, tokens[EQ]
  229. case '<':
  230. _, _ = s.ScanWhiteSpace()
  231. if r := s.read(); r == '=' {
  232. return LTE, tokens[LTE]
  233. } else {
  234. s.unread()
  235. }
  236. return LT, tokens[LT]
  237. case '>':
  238. _, _ = s.ScanWhiteSpace()
  239. if r := s.read(); r == '=' {
  240. return GTE, tokens[GTE]
  241. } else {
  242. s.unread()
  243. }
  244. return GT, tokens[GT]
  245. case '+':
  246. return ADD, tokens[ADD]
  247. case '-':
  248. _, _ = s.ScanWhiteSpace()
  249. if r := s.read(); r == '-' {
  250. s.skipUntilNewline()
  251. return COMMENT, ""
  252. } else if r == '>' {
  253. return ARROW, tokens[ARROW]
  254. } else if isDigit(r) {
  255. s.unread()
  256. return s.ScanNumber(false, true)
  257. } else if r == '.' {
  258. _, _ = s.ScanWhiteSpace()
  259. if r1 := s.read(); isDigit(r1) {
  260. s.unread()
  261. return s.ScanNumber(true, true)
  262. } else {
  263. s.unread()
  264. }
  265. s.unread()
  266. } else {
  267. s.unread()
  268. }
  269. return SUB, tokens[SUB]
  270. case '/':
  271. _, _ = s.ScanWhiteSpace()
  272. if r := s.read(); r == '*' {
  273. if err := s.skipUntilEndComment(); err != nil {
  274. return ILLEGAL, ""
  275. }
  276. return COMMENT, ""
  277. } else {
  278. s.unread()
  279. }
  280. return DIV, tokens[DIV]
  281. case '.':
  282. if r := s.read(); isDigit(r) {
  283. s.unread()
  284. return s.ScanNumber(true, false)
  285. }
  286. s.unread()
  287. return DOT, tokens[DOT]
  288. case '%':
  289. return MOD, tokens[MOD]
  290. case '&':
  291. return BITWISE_AND, tokens[BITWISE_AND]
  292. case '|':
  293. return BITWISE_OR, tokens[BITWISE_OR]
  294. case '^':
  295. return BITWISE_XOR, tokens[BITWISE_XOR]
  296. case '*':
  297. return ASTERISK, tokens[ASTERISK]
  298. case ',':
  299. return COMMA, tokens[COMMA]
  300. case '(':
  301. return LPAREN, tokens[LPAREN]
  302. case ')':
  303. return RPAREN, tokens[RPAREN]
  304. case '[':
  305. return LBRACKET, tokens[LBRACKET]
  306. case ']':
  307. return RBRACKET, tokens[RBRACKET]
  308. case ':':
  309. return COLON, tokens[COLON]
  310. case '#':
  311. return HASH, tokens[HASH]
  312. case ';':
  313. return SEMICOLON, tokens[SEMICOLON]
  314. }
  315. return ILLEGAL, ""
  316. }
  317. func (s *Scanner) ScanIdent() (tok Token, lit string) {
  318. var buf bytes.Buffer
  319. buf.WriteRune(s.read())
  320. for {
  321. if ch := s.read(); ch == eof {
  322. break
  323. } else if !isLetter(ch) && !isDigit(ch) && ch != '_' {
  324. s.unread()
  325. break
  326. } else {
  327. buf.WriteRune(ch)
  328. }
  329. }
  330. switch lit = strings.ToUpper(buf.String()); lit {
  331. case "SELECT":
  332. return SELECT, lit
  333. case "AS":
  334. return AS, lit
  335. case "FROM":
  336. return FROM, lit
  337. case "WHERE":
  338. return WHERE, lit
  339. case "AND":
  340. return AND, lit
  341. case "OR":
  342. return OR, lit
  343. case "GROUP":
  344. return GROUP, lit
  345. case "HAVING":
  346. return HAVING, lit
  347. case "ORDER":
  348. return ORDER, lit
  349. case "BY":
  350. return BY, lit
  351. case "DESC":
  352. return DESC, lit
  353. case "ASC":
  354. return ASC, lit
  355. case "FILTER":
  356. return FILTER, lit
  357. case "INNER":
  358. return INNER, lit
  359. case "LEFT":
  360. return LEFT, lit
  361. case "RIGHT":
  362. return RIGHT, lit
  363. case "FULL":
  364. return FULL, lit
  365. case "CROSS":
  366. return CROSS, lit
  367. case "JOIN":
  368. return JOIN, lit
  369. case "ON":
  370. return ON, lit
  371. case "CREATE":
  372. return CREATE, lit
  373. case "DROP":
  374. return DROP, lit
  375. case "EXPLAIN":
  376. return EXPLAIN, lit
  377. case "DESCRIBE":
  378. return DESCRIBE, lit
  379. case "SHOW":
  380. return SHOW, lit
  381. case "STREAM":
  382. return STREAM, lit
  383. case "STREAMS":
  384. return STREAMS, lit
  385. case "WITH":
  386. return WITH, lit
  387. case "BIGINT":
  388. return XBIGINT, lit
  389. case "FLOAT":
  390. return XFLOAT, lit
  391. case "DATETIME":
  392. return XDATETIME, lit
  393. case "STRING":
  394. return XSTRING, lit
  395. case "BOOLEAN":
  396. return XBOOLEAN, lit
  397. case "ARRAY":
  398. return XARRAY, lit
  399. case "STRUCT":
  400. return XSTRUCT, lit
  401. case "DATASOURCE":
  402. return DATASOURCE, lit
  403. case "KEY":
  404. return KEY, lit
  405. case "FORMAT":
  406. return FORMAT, lit
  407. case "CONF_KEY":
  408. return CONF_KEY, lit
  409. case "TYPE":
  410. return TYPE, lit
  411. case "TRUE":
  412. return TRUE, lit
  413. case "FALSE":
  414. return FALSE, lit
  415. case "STRICT_VALIDATION":
  416. return STRICT_VALIDATION, lit
  417. case "TIMESTAMP":
  418. return TIMESTAMP, lit
  419. case "TIMESTAMP_FORMAT":
  420. return TIMESTAMP_FORMAT, lit
  421. case "DD":
  422. return DD, lit
  423. case "HH":
  424. return HH, lit
  425. case "MI":
  426. return MI, lit
  427. case "SS":
  428. return SS, lit
  429. case "MS":
  430. return MS, lit
  431. }
  432. return IDENT, buf.String()
  433. }
  434. func (s *Scanner) ScanString() (tok Token, lit string) {
  435. var buf bytes.Buffer
  436. ch := s.read()
  437. buf.WriteRune(ch)
  438. escape := false
  439. for {
  440. ch = s.read()
  441. if ch == '"' && !escape {
  442. buf.WriteRune(ch)
  443. break
  444. } else if ch == eof {
  445. return BADSTRING, buf.String()
  446. } else if ch == '\\' && !escape {
  447. escape = true
  448. buf.WriteRune(ch)
  449. } else {
  450. escape = false
  451. buf.WriteRune(ch)
  452. }
  453. }
  454. r, _ := strconv.Unquote(buf.String())
  455. return STRING, r
  456. }
  457. func (s *Scanner) ScanDigit() (tok Token, lit string) {
  458. var buf bytes.Buffer
  459. ch := s.read()
  460. buf.WriteRune(ch)
  461. for {
  462. if ch := s.read(); isDigit(ch) {
  463. buf.WriteRune(ch)
  464. } else {
  465. s.unread()
  466. break
  467. }
  468. }
  469. return INTEGER, buf.String()
  470. }
  471. func (s *Scanner) ScanNumber(startWithDot bool, isNeg bool) (tok Token, lit string) {
  472. var buf bytes.Buffer
  473. if isNeg {
  474. buf.WriteRune('-')
  475. }
  476. if startWithDot {
  477. buf.WriteRune('.')
  478. }
  479. ch := s.read()
  480. buf.WriteRune(ch)
  481. isNum := false
  482. for {
  483. if ch := s.read(); isDigit(ch) {
  484. buf.WriteRune(ch)
  485. } else if ch == '.' {
  486. isNum = true
  487. buf.WriteRune(ch)
  488. } else {
  489. s.unread()
  490. break
  491. }
  492. }
  493. if isNum || startWithDot {
  494. return NUMBER, buf.String()
  495. } else {
  496. return INTEGER, buf.String()
  497. }
  498. }
  499. func (s *Scanner) ScanBackquoteIdent() (tok Token, lit string) {
  500. var buf bytes.Buffer
  501. for {
  502. ch := s.read()
  503. if isBackquote(ch) || ch == eof {
  504. break
  505. } else {
  506. buf.WriteRune(ch)
  507. }
  508. }
  509. return IDENT, buf.String()
  510. }
  511. func (s *Scanner) skipUntilNewline() {
  512. for {
  513. if ch := s.read(); ch == '\n' || ch == eof {
  514. return
  515. }
  516. }
  517. }
  518. func (s *Scanner) skipUntilEndComment() error {
  519. for {
  520. if ch1 := s.read(); ch1 == '*' {
  521. // We might be at the end.
  522. star:
  523. ch2 := s.read()
  524. if ch2 == '/' {
  525. return nil
  526. } else if ch2 == '*' {
  527. // We are back in the state machine since we see a star.
  528. goto star
  529. } else if ch2 == eof {
  530. return io.EOF
  531. }
  532. } else if ch1 == eof {
  533. return io.EOF
  534. }
  535. }
  536. }
  537. func (s *Scanner) ScanWhiteSpace() (tok Token, lit string) {
  538. var buf bytes.Buffer
  539. for {
  540. if ch := s.read(); ch == eof {
  541. break
  542. } else if !isWhiteSpace(ch) {
  543. s.unread()
  544. break
  545. } else {
  546. buf.WriteRune(ch)
  547. }
  548. }
  549. return WS, buf.String()
  550. }
  551. func (s *Scanner) read() rune {
  552. ch, _, err := s.r.ReadRune()
  553. if err != nil {
  554. return eof
  555. }
  556. return ch
  557. }
  558. func (s *Scanner) unread() {
  559. _ = s.r.UnreadRune()
  560. }
  // eof is the rune (NUL) that read returns when no further input is available.
  var eof rune = 0
  // isWhiteSpace reports whether r is a space, tab, carriage return or newline.
  func isWhiteSpace(r rune) bool {
  	switch r {
  	case ' ', '\t', '\r', '\n':
  		return true
  	}
  	return false
  }
  // isLetter reports whether ch is an ASCII letter (a-z or A-Z).
  func isLetter(ch rune) bool {
  	switch {
  	case 'a' <= ch && ch <= 'z':
  		return true
  	case 'A' <= ch && ch <= 'Z':
  		return true
  	}
  	return false
  }
  // isDigit reports whether ch is an ASCII decimal digit (0-9).
  func isDigit(ch rune) bool {
  	if ch < '0' {
  		return false
  	}
  	return ch <= '9'
  }
  // isQuotation reports whether ch is the double quote that delimits string
  // literals.
  func isQuotation(ch rune) bool {
  	const doubleQuote = '"'
  	return ch == doubleQuote
  }
  // isBackquote reports whether ch is the back quote that delimits quoted
  // identifiers.
  func isBackquote(ch rune) bool {
  	const backQuote = '`'
  	return ch == backQuote
  }
  569. func (tok Token) isOperator() bool {
  570. return (tok > operatorBeg && tok < operatorEnd) || tok == ASTERISK || tok == LBRACKET
  571. }
  572. func (tok Token) isTimeLiteral() bool { return tok >= DD && tok <= MS }
  573. func (tok Token) allowedSourceToken() bool {
  574. return tok == IDENT || tok == DIV || tok == HASH || tok == ADD
  575. }
  576. //Allowed special field name token
  577. func (tok Token) allowedSFNToken() bool { return tok == DOT }
  578. func (tok Token) Precedence() int {
  579. switch tok {
  580. case OR:
  581. return 1
  582. case AND:
  583. return 2
  584. case EQ, NEQ, LT, LTE, GT, GTE:
  585. return 3
  586. case ADD, SUB, BITWISE_OR, BITWISE_XOR:
  587. return 4
  588. case MUL, DIV, MOD, BITWISE_AND, SUBSET, ARROW:
  589. return 5
  590. }
  591. return 0
  592. }
  // DataType enumerates the field types known to the language.
  type DataType int

  // The values are assigned by iota, so the declaration order is significant:
  // isSimpleType relies on BIGINT..BOOLEAN being contiguous.
  const (
  	UNKNOWN DataType = iota
  	BIGINT
  	FLOAT
  	STRINGS
  	DATETIME
  	BOOLEAN
  	ARRAY
  	STRUCT
  )

  // dataTypes holds the lower-case name of each DataType, indexed by its
  // value. UNKNOWN has no entry and therefore prints as "".
  var dataTypes = []string{
  	BIGINT:   "bigint",
  	FLOAT:    "float",
  	STRINGS:  "string",
  	DATETIME: "datetime",
  	BOOLEAN:  "boolean",
  	ARRAY:    "array",
  	STRUCT:   "struct",
  }

  // isSimpleType reports whether d lies in the range BIGINT..BOOLEAN
  // (inclusive), which excludes UNKNOWN, ARRAY and STRUCT.
  func (d DataType) isSimpleType() bool {
  	if d < BIGINT {
  		return false
  	}
  	return d <= BOOLEAN
  }

  // String returns the lower-case name of d, or "" when d has no entry in the
  // dataTypes table.
  func (d DataType) String() string {
  	if d < 0 || d >= DataType(len(dataTypes)) {
  		return ""
  	}
  	return dataTypes[d]
  }
  622. func getDataType(tok Token) DataType {
  623. switch tok {
  624. case XBIGINT:
  625. return BIGINT
  626. case XFLOAT:
  627. return FLOAT
  628. case XSTRING:
  629. return STRINGS
  630. case XDATETIME:
  631. return DATETIME
  632. case XBOOLEAN:
  633. return BOOLEAN
  634. case XARRAY:
  635. return ARRAY
  636. case XSTRUCT:
  637. return STRUCT
  638. }
  639. return UNKNOWN
  640. }