lexical.go 11 KB


  1. package xsql
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "strconv"
  7. "strings"
  8. )
  // Token identifies a lexical token produced by the Scanner.
  type Token int

  // Token values. The declaration order is significant: isOperator tests the
  // range between operatorBeg and operatorEnd, and isTimeLiteral tests the
  // range DD..MS, so new constants must be added inside the matching group.
  const (
  	// Special tokens
  	ILLEGAL Token = iota
  	EOF
  	WS
  	COMMENT
  	AS

  	// Literals
  	IDENT     // main
  	INTEGER   // 12345
  	NUMBER    // 12345.67
  	STRING    // "abc"
  	BADSTRING // "abc

  	// Operators: every constant between operatorBeg and operatorEnd.
  	operatorBeg
  	ADD         // +
  	SUB         // -
  	MUL         // *
  	DIV         // /
  	MOD         // %
  	BITWISE_AND // &
  	BITWISE_OR  // |
  	BITWISE_XOR // ^
  	AND         // AND
  	OR          // OR
  	EQ          // =
  	NEQ         // !=
  	LT          // <
  	LTE         // <=
  	GT          // >
  	GTE         // >=
  	SUBSET      // []
  	ARROW       // ->
  	operatorEnd

  	// Misc characters
  	ASTERISK  // *
  	COMMA     // ,
  	LPAREN    // (
  	RPAREN    // )
  	LBRACKET  // [
  	RBRACKET  // ]
  	HASH      // #
  	DOT       // .
  	COLON     // :
  	SEMICOLON // ;
  	COLSEP    // \007

  	// Keywords
  	SELECT
  	FROM
  	JOIN
  	INNER
  	LEFT
  	RIGHT
  	FULL
  	CROSS
  	ON
  	WHERE
  	GROUP
  	ORDER
  	HAVING
  	BY
  	ASC
  	DESC
  	TRUE
  	FALSE

  	CREATE
  	DROP
  	EXPLAIN
  	DESCRIBE
  	SHOW
  	STREAM
  	STREAMS
  	WITH

  	// Data type keywords
  	XBIGINT
  	XFLOAT
  	XSTRING
  	XDATETIME
  	XBOOLEAN
  	XARRAY
  	XSTRUCT

  	// Stream option keywords
  	DATASOURCE
  	KEY
  	FORMAT
  	CONF_KEY
  	TYPE
  	STRICT_VALIDATION
  	TIMESTAMP
  	TIMESTAMP_FORMAT

  	// Time unit keywords (the range tested by isTimeLiteral)
  	DD
  	HH
  	MI
  	SS
  	MS
  )

  // tokens maps each Token to its display text, indexed by the token value.
  // Constants without an entry (operatorBeg, operatorEnd, MUL's siblings that
  // are listed below excepted) render as the empty string.
  var tokens = []string{
  	ILLEGAL:   "ILLEGAL",
  	EOF:       "EOF",
  	WS:        "WS",
  	COMMENT:   "COMMENT",
  	AS:        "AS",
  	IDENT:     "IDENT",
  	INTEGER:   "INTEGER",
  	NUMBER:    "NUMBER",
  	STRING:    "STRING",
  	BADSTRING: "BADSTRING",

  	ADD:         "+",
  	SUB:         "-",
  	MUL:         "*",
  	DIV:         "/",
  	MOD:         "%",
  	BITWISE_AND: "&",
  	BITWISE_OR:  "|",
  	BITWISE_XOR: "^",
  	AND:         "AND",
  	OR:          "OR",
  	EQ:          "=",
  	NEQ:         "!=",
  	LT:          "<",
  	LTE:         "<=",
  	GT:          ">",
  	GTE:         ">=",
  	SUBSET:      "[]",
  	ARROW:       "->",

  	ASTERISK:  "*",
  	COMMA:     ",",
  	LPAREN:    "(",
  	RPAREN:    ")",
  	LBRACKET:  "[",
  	RBRACKET:  "]",
  	HASH:      "#",
  	DOT:       ".",
  	COLON:     ":",
  	SEMICOLON: ";",
  	COLSEP:    "\007",

  	SELECT: "SELECT",
  	FROM:   "FROM",
  	JOIN:   "JOIN",
  	INNER:  "INNER",
  	LEFT:   "LEFT",
  	RIGHT:  "RIGHT",
  	FULL:   "FULL",
  	CROSS:  "CROSS",
  	ON:     "ON",
  	WHERE:  "WHERE",
  	GROUP:  "GROUP",
  	ORDER:  "ORDER",
  	HAVING: "HAVING",
  	BY:     "BY",
  	ASC:    "ASC",
  	DESC:   "DESC",
  	TRUE:   "TRUE",
  	FALSE:  "FALSE",

  	CREATE:   "CREATE",
  	DROP:     "DROP", // was misspelled "RROP"
  	EXPLAIN:  "EXPLAIN",
  	DESCRIBE: "DESCRIBE",
  	SHOW:     "SHOW",
  	STREAM:   "STREAM",
  	STREAMS:  "STREAMS",
  	WITH:     "WITH",

  	XBIGINT:   "BIGINT",
  	XFLOAT:    "FLOAT",
  	XSTRING:   "STRING",
  	XDATETIME: "DATETIME",
  	XBOOLEAN:  "BOOLEAN",
  	XARRAY:    "ARRAY",
  	XSTRUCT:   "STRUCT",

  	DATASOURCE:        "DATASOURCE",
  	KEY:               "KEY",
  	FORMAT:            "FORMAT",
  	CONF_KEY:          "CONF_KEY",
  	TYPE:              "TYPE",
  	STRICT_VALIDATION: "STRICT_VALIDATION",
  	TIMESTAMP:         "TIMESTAMP",
  	TIMESTAMP_FORMAT:  "TIMESTAMP_FORMAT",

  	DD: "DD",
  	HH: "HH",
  	MI: "MI",
  	SS: "SS",
  	MS: "MS",
  }

  // String returns the display text of tok, or the empty string when tok is
  // outside the tokens table or has no entry in it.
  func (tok Token) String() string {
  	if tok < 0 || int(tok) >= len(tokens) {
  		return ""
  	}
  	return tokens[tok]
  }
  // Scanner is a lexical scanner that tokenizes the text of a buffered reader.
  type Scanner struct {
  	r *bufio.Reader // source of runes; wrapped once by NewScanner
  }

  // NewScanner returns a Scanner that reads from r through a bufio.Reader.
  func NewScanner(r io.Reader) *Scanner {
  	s := new(Scanner)
  	s.r = bufio.NewReader(r)
  	return s
  }
  198. func (s *Scanner) Scan() (tok Token, lit string) {
  199. ch := s.read()
  200. if isWhiteSpace(ch) {
  201. //s.unread()
  202. return s.ScanWhiteSpace()
  203. } else if isLetter(ch) {
  204. s.unread()
  205. return s.ScanIdent()
  206. } else if isQuotation(ch) {
  207. s.unread()
  208. return s.ScanString()
  209. } else if isDigit(ch) {
  210. s.unread()
  211. return s.ScanNumber(false, false)
  212. } else if isBackquote(ch) {
  213. return s.ScanBackquoteIdent()
  214. }
  215. switch ch {
  216. case eof:
  217. return EOF, tokens[EOF]
  218. case '=':
  219. return EQ, tokens[EQ]
  220. case '!':
  221. _, _ = s.ScanWhiteSpace()
  222. if r := s.read(); r == '=' {
  223. return NEQ, tokens[NEQ]
  224. } else {
  225. s.unread()
  226. }
  227. return EQ, tokens[EQ]
  228. case '<':
  229. _, _ = s.ScanWhiteSpace()
  230. if r := s.read(); r == '=' {
  231. return LTE, tokens[LTE]
  232. } else {
  233. s.unread()
  234. }
  235. return LT, tokens[LT]
  236. case '>':
  237. _, _ = s.ScanWhiteSpace()
  238. if r := s.read(); r == '=' {
  239. return GTE, tokens[GTE]
  240. } else {
  241. s.unread()
  242. }
  243. return GT, tokens[GT]
  244. case '+':
  245. return ADD, tokens[ADD]
  246. case '-':
  247. _, _ = s.ScanWhiteSpace()
  248. if r := s.read(); r == '-' {
  249. s.skipUntilNewline()
  250. return COMMENT, ""
  251. } else if r == '>' {
  252. return ARROW, tokens[ARROW]
  253. } else if isDigit(r) {
  254. s.unread()
  255. return s.ScanNumber(false, true)
  256. } else if r == '.' {
  257. _, _ = s.ScanWhiteSpace()
  258. if r1 := s.read(); isDigit(r1) {
  259. s.unread()
  260. return s.ScanNumber(true, true)
  261. } else {
  262. s.unread()
  263. }
  264. s.unread()
  265. } else {
  266. s.unread()
  267. }
  268. return SUB, tokens[SUB]
  269. case '/':
  270. _, _ = s.ScanWhiteSpace()
  271. if r := s.read(); r == '*' {
  272. if err := s.skipUntilEndComment(); err != nil {
  273. return ILLEGAL, ""
  274. }
  275. return COMMENT, ""
  276. } else {
  277. s.unread()
  278. }
  279. return DIV, tokens[DIV]
  280. case '.':
  281. if r := s.read(); isDigit(r) {
  282. s.unread()
  283. return s.ScanNumber(true, false)
  284. }
  285. s.unread()
  286. return DOT, tokens[DOT]
  287. case '%':
  288. return MOD, tokens[MOD]
  289. case '&':
  290. return BITWISE_AND, tokens[BITWISE_AND]
  291. case '|':
  292. return BITWISE_OR, tokens[BITWISE_OR]
  293. case '^':
  294. return BITWISE_XOR, tokens[BITWISE_XOR]
  295. case '*':
  296. return ASTERISK, tokens[ASTERISK]
  297. case ',':
  298. return COMMA, tokens[COMMA]
  299. case '(':
  300. return LPAREN, tokens[LPAREN]
  301. case ')':
  302. return RPAREN, tokens[RPAREN]
  303. case '[':
  304. return LBRACKET, tokens[LBRACKET]
  305. case ']':
  306. return RBRACKET, tokens[RBRACKET]
  307. case ':':
  308. return COLON, tokens[COLON]
  309. case '#':
  310. return HASH, tokens[HASH]
  311. case ';':
  312. return SEMICOLON, tokens[SEMICOLON]
  313. }
  314. return ILLEGAL, ""
  315. }
  316. func (s *Scanner) ScanIdent() (tok Token, lit string) {
  317. var buf bytes.Buffer
  318. buf.WriteRune(s.read())
  319. for {
  320. if ch := s.read(); ch == eof {
  321. break
  322. } else if !isLetter(ch) && !isDigit(ch) && ch != '_' {
  323. s.unread()
  324. break
  325. } else {
  326. buf.WriteRune(ch)
  327. }
  328. }
  329. switch lit = strings.ToUpper(buf.String()); lit {
  330. case "SELECT":
  331. return SELECT, lit
  332. case "AS":
  333. return AS, lit
  334. case "FROM":
  335. return FROM, lit
  336. case "WHERE":
  337. return WHERE, lit
  338. case "AND":
  339. return AND, lit
  340. case "OR":
  341. return OR, lit
  342. case "GROUP":
  343. return GROUP, lit
  344. case "HAVING":
  345. return HAVING, lit
  346. case "ORDER":
  347. return ORDER, lit
  348. case "BY":
  349. return BY, lit
  350. case "DESC":
  351. return DESC, lit
  352. case "ASC":
  353. return ASC, lit
  354. case "INNER":
  355. return INNER, lit
  356. case "LEFT":
  357. return LEFT, lit
  358. case "RIGHT":
  359. return RIGHT, lit
  360. case "FULL":
  361. return FULL, lit
  362. case "CROSS":
  363. return CROSS, lit
  364. case "JOIN":
  365. return JOIN, lit
  366. case "ON":
  367. return ON, lit
  368. case "CREATE":
  369. return CREATE, lit
  370. case "DROP":
  371. return DROP, lit
  372. case "EXPLAIN":
  373. return EXPLAIN, lit
  374. case "DESCRIBE":
  375. return DESCRIBE, lit
  376. case "SHOW":
  377. return SHOW, lit
  378. case "STREAM":
  379. return STREAM, lit
  380. case "STREAMS":
  381. return STREAMS, lit
  382. case "WITH":
  383. return WITH, lit
  384. case "BIGINT":
  385. return XBIGINT, lit
  386. case "FLOAT":
  387. return XFLOAT, lit
  388. case "DATETIME":
  389. return XDATETIME, lit
  390. case "STRING":
  391. return XSTRING, lit
  392. case "BOOLEAN":
  393. return XBOOLEAN, lit
  394. case "ARRAY":
  395. return XARRAY, lit
  396. case "STRUCT":
  397. return XSTRUCT, lit
  398. case "DATASOURCE":
  399. return DATASOURCE, lit
  400. case "KEY":
  401. return KEY, lit
  402. case "FORMAT":
  403. return FORMAT, lit
  404. case "CONF_KEY":
  405. return CONF_KEY, lit
  406. case "TYPE":
  407. return TYPE, lit
  408. case "TRUE":
  409. return TRUE, lit
  410. case "FALSE":
  411. return FALSE, lit
  412. case "STRICT_VALIDATION":
  413. return STRICT_VALIDATION, lit
  414. case "TIMESTAMP":
  415. return TIMESTAMP, lit
  416. case "TIMESTAMP_FORMAT":
  417. return TIMESTAMP_FORMAT, lit
  418. case "DD":
  419. return DD, lit
  420. case "HH":
  421. return HH, lit
  422. case "MI":
  423. return MI, lit
  424. case "SS":
  425. return SS, lit
  426. case "MS":
  427. return MS, lit
  428. }
  429. return IDENT, buf.String()
  430. }
  431. func (s *Scanner) ScanString() (tok Token, lit string) {
  432. var buf bytes.Buffer
  433. ch := s.read()
  434. buf.WriteRune(ch)
  435. escape := false
  436. for {
  437. ch = s.read()
  438. if ch == '"' && !escape {
  439. buf.WriteRune(ch)
  440. break
  441. } else if ch == eof {
  442. return BADSTRING, buf.String()
  443. } else if ch == '\\' && !escape {
  444. escape = true
  445. buf.WriteRune(ch)
  446. } else {
  447. escape = false
  448. buf.WriteRune(ch)
  449. }
  450. }
  451. r, _ := strconv.Unquote(buf.String())
  452. return STRING, r
  453. }
  454. func (s *Scanner) ScanDigit() (tok Token, lit string) {
  455. var buf bytes.Buffer
  456. ch := s.read()
  457. buf.WriteRune(ch)
  458. for {
  459. if ch := s.read(); isDigit(ch) {
  460. buf.WriteRune(ch)
  461. } else {
  462. s.unread()
  463. break
  464. }
  465. }
  466. return INTEGER, buf.String()
  467. }
  468. func (s *Scanner) ScanNumber(startWithDot bool, isNeg bool) (tok Token, lit string) {
  469. var buf bytes.Buffer
  470. if isNeg {
  471. buf.WriteRune('-')
  472. }
  473. if startWithDot {
  474. buf.WriteRune('.')
  475. }
  476. ch := s.read()
  477. buf.WriteRune(ch)
  478. isNum := false
  479. for {
  480. if ch := s.read(); isDigit(ch) {
  481. buf.WriteRune(ch)
  482. } else if ch == '.' {
  483. isNum = true
  484. buf.WriteRune(ch)
  485. } else {
  486. s.unread()
  487. break
  488. }
  489. }
  490. if isNum || startWithDot {
  491. return NUMBER, buf.String()
  492. } else {
  493. return INTEGER, buf.String()
  494. }
  495. }
  496. func (s *Scanner) ScanBackquoteIdent() (tok Token, lit string) {
  497. var buf bytes.Buffer
  498. for {
  499. ch := s.read()
  500. if isBackquote(ch) || ch == eof {
  501. break
  502. } else {
  503. buf.WriteRune(ch)
  504. }
  505. }
  506. return IDENT, buf.String()
  507. }
  508. func (s *Scanner) skipUntilNewline() {
  509. for {
  510. if ch := s.read(); ch == '\n' || ch == eof {
  511. return
  512. }
  513. }
  514. }
  515. func (s *Scanner) skipUntilEndComment() error {
  516. for {
  517. if ch1 := s.read(); ch1 == '*' {
  518. // We might be at the end.
  519. star:
  520. ch2 := s.read()
  521. if ch2 == '/' {
  522. return nil
  523. } else if ch2 == '*' {
  524. // We are back in the state machine since we see a star.
  525. goto star
  526. } else if ch2 == eof {
  527. return io.EOF
  528. }
  529. } else if ch1 == eof {
  530. return io.EOF
  531. }
  532. }
  533. }
  534. func (s *Scanner) ScanWhiteSpace() (tok Token, lit string) {
  535. var buf bytes.Buffer
  536. for {
  537. if ch := s.read(); ch == eof {
  538. break
  539. } else if !isWhiteSpace(ch) {
  540. s.unread()
  541. break
  542. } else {
  543. buf.WriteRune(ch)
  544. }
  545. }
  546. return WS, buf.String()
  547. }
  548. func (s *Scanner) read() rune {
  549. ch, _, err := s.r.ReadRune()
  550. if err != nil {
  551. return eof
  552. }
  553. return ch
  554. }
  555. func (s *Scanner) unread() {
  556. _ = s.r.UnreadRune()
  557. }
  558. var eof = rune(0)
  // isWhiteSpace reports whether r is a space, tab, carriage return or newline.
  func isWhiteSpace(r rune) bool {
  	switch r {
  	case ' ', '\t', '\r', '\n':
  		return true
  	}
  	return false
  }
  // isLetter reports whether ch is an ASCII letter.
  func isLetter(ch rune) bool {
  	switch {
  	case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z':
  		return true
  	}
  	return false
  }

  // isDigit reports whether ch is an ASCII decimal digit.
  func isDigit(ch rune) bool {
  	return '0' <= ch && ch <= '9'
  }

  // isQuotation reports whether ch is a double quote.
  func isQuotation(ch rune) bool {
  	const doubleQuote = '"'
  	return ch == doubleQuote
  }

  // isBackquote reports whether ch is a backquote.
  func isBackquote(ch rune) bool {
  	const backquote = '`'
  	return ch == backquote
  }
  566. func (tok Token) isOperator() bool {
  567. return (tok > operatorBeg && tok < operatorEnd) || tok == ASTERISK || tok == LBRACKET
  568. }
  569. func (tok Token) isTimeLiteral() bool { return tok >= DD && tok <= MS }
  570. func (tok Token) allowedSourceToken() bool {
  571. return tok == IDENT || tok == DIV || tok == HASH || tok == ADD
  572. }
  573. //Allowed special field name token
  574. func (tok Token) allowedSFNToken() bool { return tok == DOT }
  575. func (tok Token) Precedence() int {
  576. switch tok {
  577. case OR:
  578. return 1
  579. case AND:
  580. return 2
  581. case EQ, NEQ, LT, LTE, GT, GTE:
  582. return 3
  583. case ADD, SUB, BITWISE_OR, BITWISE_XOR:
  584. return 4
  585. case MUL, DIV, MOD, BITWISE_AND, SUBSET, ARROW:
  586. return 5
  587. }
  588. return 0
  589. }
  // DataType enumerates the field types known to the dialect.
  type DataType int

  // Data type values. BIGINT through BOOLEAN are the simple types reported by
  // isSimpleType; ARRAY and STRUCT follow them.
  const (
  	UNKNOWN DataType = iota
  	BIGINT
  	FLOAT
  	STRINGS
  	DATETIME
  	BOOLEAN
  	ARRAY
  	STRUCT
  )

  // dataTypes holds the lower-case name of each DataType, indexed by value.
  // UNKNOWN has no entry and therefore maps to the empty string.
  var dataTypes = []string{
  	BIGINT:   "bigint",
  	FLOAT:    "float",
  	STRINGS:  "string",
  	DATETIME: "datetime",
  	BOOLEAN:  "boolean",
  	ARRAY:    "array",
  	STRUCT:   "struct",
  }

  // isSimpleType reports whether d is one of BIGINT, FLOAT, STRINGS, DATETIME
  // or BOOLEAN.
  func (d DataType) isSimpleType() bool {
  	switch d {
  	case BIGINT, FLOAT, STRINGS, DATETIME, BOOLEAN:
  		return true
  	}
  	return false
  }

  // String returns the lower-case name of d, or the empty string when d is
  // UNKNOWN or outside the dataTypes table.
  func (d DataType) String() string {
  	if d < 0 || int(d) >= len(dataTypes) {
  		return ""
  	}
  	return dataTypes[d]
  }
  619. func getDataType(tok Token) DataType {
  620. switch tok {
  621. case XBIGINT:
  622. return BIGINT
  623. case XFLOAT:
  624. return FLOAT
  625. case XSTRING:
  626. return STRINGS
  627. case XDATETIME:
  628. return DATETIME
  629. case XBOOLEAN:
  630. return BOOLEAN
  631. case XARRAY:
  632. return ARRAY
  633. case XSTRUCT:
  634. return STRUCT
  635. }
  636. return UNKNOWN
  637. }