lexical.go 11 KB


  1. package xsql
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "strings"
  7. )
  // Token identifies the lexical class of a scanned item. The numeric values
  // come from iota, so the declaration order below is significant: range checks
  // elsewhere in the package compare against operatorBeg/operatorEnd and DD/MS.
  type Token int

  const (
  	// Special tokens.
  	ILLEGAL Token = iota
  	EOF
  	WS
  	COMMENT
  	AS

  	// Literals.
  	IDENT // main
  	INTEGER // 12345
  	NUMBER // 12345.67
  	STRING // "abc"
  	BADSTRING // "abc (no closing quote)

  	// Operators live strictly between operatorBeg and operatorEnd.
  	operatorBeg
  	ADD // +
  	SUB // -
  	MUL // *
  	DIV // /
  	MOD // %
  	BITWISE_AND // &
  	BITWISE_OR // |
  	BITWISE_XOR // ^
  	AND // AND
  	OR // OR
  	EQ // =
  	NEQ // !=
  	LT // <
  	LTE // <=
  	GT // >
  	GTE // >=
  	SUBSET // [
  	ARROW // ->
  	operatorEnd

  	// Punctuation and other single characters.
  	ASTERISK // *
  	COMMA // ,
  	LPAREN // (
  	RPAREN // )
  	LBRACKET // [
  	RBRACKET // ]
  	HASH // #
  	DOT // .
  	COLON // :
  	SEMICOLON // ;

  	// Keywords.
  	SELECT
  	FROM
  	JOIN
  	INNER
  	LEFT
  	RIGHT
  	FULL
  	CROSS
  	ON
  	WHERE
  	GROUP
  	ORDER
  	HAVING
  	BY
  	ASC
  	DESC
  	TRUE
  	FALSE
  	CREATE
  	DROP
  	EXPLAIN
  	DESCRIBE
  	SHOW
  	STREAM
  	STREAMS
  	WITH

  	// Type-name keywords.
  	XBIGINT
  	XFLOAT
  	XSTRING
  	XDATETIME
  	XBOOLEAN
  	XARRAY
  	XSTRUCT

  	// Option keywords.
  	DATASOURCE
  	KEY
  	FORMAT
  	CONF_KEY
  	TYPE
  	STRICT_VALIDATION
  	TIMESTAMP
  	TIMESTAMP_FORMAT

  	// Time-unit keywords; DD..MS must stay contiguous.
  	DD
  	HH
  	MI
  	SS
  	MS
  )
  102. var tokens = []string{
  103. ILLEGAL: "ILLEGAL",
  104. EOF: "EOF",
  105. AS: "AS",
  106. WS: "WS",
  107. IDENT: "IDENT",
  108. INTEGER: "INTEGER",
  109. NUMBER: "NUMBER",
  110. STRING: "STRING",
  111. ADD: "+",
  112. SUB: "-",
  113. MUL: "*",
  114. DIV: "/",
  115. MOD: "%",
  116. BITWISE_AND: "&",
  117. BITWISE_OR: "|",
  118. BITWISE_XOR: "^",
  119. EQ: "=",
  120. NEQ: "!=",
  121. LT: "<",
  122. LTE: "<=",
  123. GT: ">",
  124. GTE: ">=",
  125. SUBSET: "[]",
  126. ARROW: "->",
  127. ASTERISK: "*",
  128. COMMA: ",",
  129. LPAREN: "(",
  130. RPAREN: ")",
  131. LBRACKET: "[",
  132. RBRACKET: "]",
  133. HASH: "#",
  134. DOT: ".",
  135. SEMICOLON: ";",
  136. COLON: ":",
  137. SELECT: "SELECT",
  138. FROM: "FROM",
  139. JOIN: "JOIN",
  140. LEFT: "LEFT",
  141. INNER: "INNER",
  142. ON: "ON",
  143. WHERE: "WHERE",
  144. GROUP: "GROUP",
  145. ORDER: "ORDER",
  146. HAVING: "HAVING",
  147. BY: "BY",
  148. ASC: "ASC",
  149. DESC: "DESC",
  150. CREATE: "CREATE",
  151. DROP: "RROP",
  152. EXPLAIN: "EXPLAIN",
  153. DESCRIBE: "DESCRIBE",
  154. SHOW: "SHOW",
  155. STREAM: "STREAM",
  156. STREAMS: "STREAMS",
  157. WITH: "WITH",
  158. XBIGINT: "BIGINT",
  159. XFLOAT: "FLOAT",
  160. XSTRING: "STRING",
  161. XDATETIME: "DATETIME",
  162. XBOOLEAN: "BOOLEAN",
  163. XARRAY: "ARRAY",
  164. XSTRUCT: "STRUCT",
  165. DATASOURCE: "DATASOURCE",
  166. KEY: "KEY",
  167. FORMAT: "FORMAT",
  168. CONF_KEY: "CONF_KEY",
  169. TYPE: "TYPE",
  170. STRICT_VALIDATION: "STRICT_VALIDATION",
  171. TIMESTAMP: "TIMESTAMP",
  172. TIMESTAMP_FORMAT: "TIMESTAMP_FORMAT",
  173. AND: "AND",
  174. OR: "OR",
  175. TRUE: "TRUE",
  176. FALSE: "FALSE",
  177. DD: "DD",
  178. HH: "HH",
  179. MI: "MI",
  180. SS: "SS",
  181. MS: "MS",
  182. }
  183. func (tok Token) String() string {
  184. if tok >= 0 && tok < Token(len(tokens)) {
  185. return tokens[tok]
  186. }
  187. return ""
  188. }
  // Scanner is a lexical scanner that splits its input into Tokens.
  type Scanner struct {
  	// r is the buffered source the scanner reads runes from; buffering is
  	// what makes the one-rune pushback in unread possible.
  	r *bufio.Reader
  }
  192. func NewScanner(r io.Reader) *Scanner {
  193. return &Scanner{r: bufio.NewReader(r)}
  194. }
  195. func (s *Scanner) Scan() (tok Token, lit string) {
  196. ch := s.read()
  197. if isWhiteSpace(ch) {
  198. //s.unread()
  199. return s.ScanWhiteSpace()
  200. } else if isLetter(ch) {
  201. s.unread()
  202. return s.ScanIdent()
  203. } else if isQuotation(ch) {
  204. s.unread()
  205. return s.ScanString()
  206. } else if isDigit(ch) {
  207. s.unread()
  208. return s.ScanNumber(false, false)
  209. }
  210. switch ch {
  211. case eof:
  212. return EOF, tokens[EOF]
  213. case '=':
  214. return EQ, tokens[EQ]
  215. case '!':
  216. _, _ = s.ScanWhiteSpace()
  217. if r := s.read(); r == '=' {
  218. return NEQ, tokens[NEQ]
  219. } else {
  220. s.unread()
  221. }
  222. return EQ, tokens[EQ]
  223. case '<':
  224. _, _ = s.ScanWhiteSpace()
  225. if r := s.read(); r == '=' {
  226. return LTE, tokens[LTE]
  227. } else {
  228. s.unread()
  229. }
  230. return LT, tokens[LT]
  231. case '>':
  232. _, _ = s.ScanWhiteSpace()
  233. if r := s.read(); r == '=' {
  234. return GTE, tokens[GTE]
  235. } else {
  236. s.unread()
  237. }
  238. return GT, tokens[GT]
  239. case '+':
  240. return ADD, tokens[ADD]
  241. case '-':
  242. _, _ = s.ScanWhiteSpace()
  243. if r := s.read(); r == '-' {
  244. s.skipUntilNewline()
  245. return COMMENT, ""
  246. } else if r == '>' {
  247. return ARROW, tokens[ARROW]
  248. } else if isDigit(r) {
  249. s.unread()
  250. return s.ScanNumber(false, true)
  251. } else if r == '.' {
  252. _, _ = s.ScanWhiteSpace()
  253. if r1 := s.read(); isDigit(r1) {
  254. s.unread()
  255. return s.ScanNumber(true, true)
  256. } else {
  257. s.unread()
  258. }
  259. s.unread()
  260. } else {
  261. s.unread()
  262. }
  263. return SUB, tokens[SUB]
  264. case '/':
  265. _, _ = s.ScanWhiteSpace()
  266. if r := s.read(); r == '*' {
  267. if err := s.skipUntilEndComment(); err != nil {
  268. return ILLEGAL, ""
  269. }
  270. return COMMENT, ""
  271. } else {
  272. s.unread()
  273. }
  274. return DIV, tokens[DIV]
  275. case '.':
  276. if r := s.read(); isDigit(r) {
  277. s.unread()
  278. return s.ScanNumber(true, false)
  279. }
  280. s.unread()
  281. return DOT, tokens[DOT]
  282. case '%':
  283. return MOD, tokens[MOD]
  284. case '&':
  285. return BITWISE_AND, tokens[BITWISE_AND]
  286. case '|':
  287. return BITWISE_OR, tokens[BITWISE_OR]
  288. case '^':
  289. return BITWISE_XOR, tokens[BITWISE_XOR]
  290. case '*':
  291. return ASTERISK, tokens[ASTERISK]
  292. case ',':
  293. return COMMA, tokens[COMMA]
  294. case '(':
  295. return LPAREN, tokens[LPAREN]
  296. case ')':
  297. return RPAREN, tokens[RPAREN]
  298. case '[':
  299. return LBRACKET, tokens[LBRACKET]
  300. case ']':
  301. return RBRACKET, tokens[RBRACKET]
  302. case ':':
  303. return COLON, tokens[COLON]
  304. case '#':
  305. return HASH, tokens[HASH]
  306. case ';':
  307. return SEMICOLON, tokens[SEMICOLON]
  308. }
  309. return ILLEGAL, ""
  310. }
  311. func (s *Scanner) ScanIdent() (tok Token, lit string) {
  312. var buf bytes.Buffer
  313. buf.WriteRune(s.read())
  314. for {
  315. if ch := s.read(); ch == eof {
  316. break
  317. } else if !isLetter(ch) && !isDigit(ch) && ch != '_' {
  318. s.unread()
  319. break
  320. } else {
  321. buf.WriteRune(ch)
  322. }
  323. }
  324. switch lit = strings.ToUpper(buf.String()); lit {
  325. case "SELECT":
  326. return SELECT, lit
  327. case "AS":
  328. return AS, lit
  329. case "FROM":
  330. return FROM, lit
  331. case "WHERE":
  332. return WHERE, lit
  333. case "AND":
  334. return AND, lit
  335. case "OR":
  336. return OR, lit
  337. case "GROUP":
  338. return GROUP, lit
  339. case "HAVING":
  340. return HAVING, lit
  341. case "ORDER":
  342. return ORDER, lit
  343. case "BY":
  344. return BY, lit
  345. case "DESC":
  346. return DESC, lit
  347. case "ASC":
  348. return ASC, lit
  349. case "INNER":
  350. return INNER, lit
  351. case "LEFT":
  352. return LEFT, lit
  353. case "RIGHT":
  354. return RIGHT, lit
  355. case "FULL":
  356. return FULL, lit
  357. case "CROSS":
  358. return CROSS, lit
  359. case "JOIN":
  360. return JOIN, lit
  361. case "ON":
  362. return ON, lit
  363. case "CREATE":
  364. return CREATE, lit
  365. case "DROP":
  366. return DROP, lit
  367. case "EXPLAIN":
  368. return EXPLAIN, lit
  369. case "DESCRIBE":
  370. return DESCRIBE, lit
  371. case "SHOW":
  372. return SHOW, lit
  373. case "STREAM":
  374. return STREAM, lit
  375. case "STREAMS":
  376. return STREAMS, lit
  377. case "WITH":
  378. return WITH, lit
  379. case "BIGINT":
  380. return XBIGINT, lit
  381. case "FLOAT":
  382. return XFLOAT, lit
  383. case "DATETIME":
  384. return XDATETIME, lit
  385. case "STRING":
  386. return XSTRING, lit
  387. case "BOOLEAN":
  388. return XBOOLEAN, lit
  389. case "ARRAY":
  390. return XARRAY, lit
  391. case "STRUCT":
  392. return XSTRUCT, lit
  393. case "DATASOURCE":
  394. return DATASOURCE, lit
  395. case "KEY":
  396. return KEY, lit
  397. case "FORMAT":
  398. return FORMAT, lit
  399. case "CONF_KEY":
  400. return CONF_KEY, lit
  401. case "TYPE":
  402. return TYPE, lit
  403. case "TRUE":
  404. return TRUE, lit
  405. case "FALSE":
  406. return FALSE, lit
  407. case "STRICT_VALIDATION":
  408. return STRICT_VALIDATION, lit
  409. case "TIMESTAMP":
  410. return TIMESTAMP, lit
  411. case "TIMESTAMP_FORMAT":
  412. return TIMESTAMP_FORMAT, lit
  413. case "DD":
  414. return DD, lit
  415. case "HH":
  416. return HH, lit
  417. case "MI":
  418. return MI, lit
  419. case "SS":
  420. return SS, lit
  421. case "MS":
  422. return MS, lit
  423. }
  424. return IDENT, buf.String()
  425. }
  426. func (s *Scanner) ScanString() (tok Token, lit string) {
  427. var buf bytes.Buffer
  428. _ = s.read()
  429. for {
  430. ch := s.read()
  431. if ch == '"' {
  432. break
  433. } else if ch == eof {
  434. return BADSTRING, buf.String()
  435. } else {
  436. buf.WriteRune(ch)
  437. }
  438. }
  439. return STRING, buf.String()
  440. }
  441. func (s *Scanner) ScanDigit() (tok Token, lit string) {
  442. var buf bytes.Buffer
  443. ch := s.read()
  444. buf.WriteRune(ch)
  445. for {
  446. if ch := s.read(); isDigit(ch) {
  447. buf.WriteRune(ch)
  448. } else {
  449. s.unread()
  450. break
  451. }
  452. }
  453. return INTEGER, buf.String()
  454. }
  455. func (s *Scanner) ScanNumber(startWithDot bool, isNeg bool) (tok Token, lit string) {
  456. var buf bytes.Buffer
  457. if isNeg {
  458. buf.WriteRune('-')
  459. }
  460. if startWithDot {
  461. buf.WriteRune('.')
  462. }
  463. ch := s.read()
  464. buf.WriteRune(ch)
  465. isNum := false
  466. for {
  467. if ch := s.read(); isDigit(ch) {
  468. buf.WriteRune(ch)
  469. } else if ch == '.' {
  470. isNum = true
  471. buf.WriteRune(ch)
  472. } else {
  473. s.unread()
  474. break
  475. }
  476. }
  477. if isNum || startWithDot {
  478. return NUMBER, buf.String()
  479. } else {
  480. return INTEGER, buf.String()
  481. }
  482. }
  483. func (s *Scanner) skipUntilNewline() {
  484. for {
  485. if ch := s.read(); ch == '\n' || ch == eof {
  486. return
  487. }
  488. }
  489. }
  490. func (s *Scanner) skipUntilEndComment() error {
  491. for {
  492. if ch1 := s.read(); ch1 == '*' {
  493. // We might be at the end.
  494. star:
  495. ch2 := s.read()
  496. if ch2 == '/' {
  497. return nil
  498. } else if ch2 == '*' {
  499. // We are back in the state machine since we see a star.
  500. goto star
  501. } else if ch2 == eof {
  502. return io.EOF
  503. }
  504. } else if ch1 == eof {
  505. return io.EOF
  506. }
  507. }
  508. }
  509. func (s *Scanner) ScanWhiteSpace() (tok Token, lit string) {
  510. var buf bytes.Buffer
  511. for {
  512. if ch := s.read(); ch == eof {
  513. break
  514. } else if !isWhiteSpace(ch) {
  515. s.unread()
  516. break
  517. } else {
  518. buf.WriteRune(ch)
  519. }
  520. }
  521. return WS, buf.String()
  522. }
  523. func (s *Scanner) read() rune {
  524. ch, _, err := s.r.ReadRune()
  525. if err != nil {
  526. return eof
  527. }
  528. return ch
  529. }
  530. func (s *Scanner) unread() {
  531. _ = s.r.UnreadRune()
  532. }
  // eof is the sentinel rune returned by Scanner.read when no more input can be
  // read. It is a constant so that it cannot be reassigned at run time.
  const eof = rune(0)
  // isWhiteSpace reports whether r is a space, tab, carriage return or newline.
  func isWhiteSpace(r rune) bool {
  	switch r {
  	case ' ', '\t', '\r', '\n':
  		return true
  	}
  	return false
  }
  // isLetter reports whether ch is an ASCII letter (a-z or A-Z).
  func isLetter(ch rune) bool {
  	switch {
  	case 'a' <= ch && ch <= 'z':
  		return true
  	case 'A' <= ch && ch <= 'Z':
  		return true
  	}
  	return false
  }
  // isDigit reports whether ch is an ASCII decimal digit (0-9).
  func isDigit(ch rune) bool {
  	if ch < '0' {
  		return false
  	}
  	return ch <= '9'
  }
  // isQuotation reports whether ch is the double-quote character that opens a
  // string literal.
  func isQuotation(ch rune) bool {
  	const quote = '"'
  	return ch == quote
  }
  540. func (tok Token) isOperator() bool {
  541. return (tok > operatorBeg && tok < operatorEnd) || tok == ASTERISK || tok == LBRACKET
  542. }
  543. func (tok Token) isTimeLiteral() bool { return tok >= DD && tok <= MS }
  544. func (tok Token) allowedSourceToken() bool {
  545. return tok == IDENT || tok == DIV || tok == HASH || tok == ADD
  546. }
  547. //Allowed special field name token
  548. func (tok Token) allowedSFNToken() bool { return tok == DOT }
  549. func (tok Token) Precedence() int {
  550. switch tok {
  551. case OR:
  552. return 1
  553. case AND:
  554. return 2
  555. case EQ, NEQ, LT, LTE, GT, GTE:
  556. return 3
  557. case ADD, SUB, BITWISE_OR, BITWISE_XOR:
  558. return 4
  559. case MUL, DIV, MOD, BITWISE_AND, SUBSET, ARROW:
  560. return 5
  561. }
  562. return 0
  563. }
  // DataType enumerates the field data types. UNKNOWN is the zero value; the
  // simple types BIGINT..BOOLEAN are declared contiguously, ahead of the
  // composite types.
  type DataType int

  const (
  	UNKNOWN DataType = iota

  	// Simple types.
  	BIGINT
  	FLOAT
  	STRINGS
  	DATETIME
  	BOOLEAN

  	// Composite types.
  	ARRAY
  	STRUCT
  )
  575. var dataTypes = []string{
  576. BIGINT: "bigint",
  577. FLOAT: "float",
  578. STRINGS: "string",
  579. DATETIME: "datetime",
  580. BOOLEAN: "boolean",
  581. ARRAY: "array",
  582. STRUCT: "struct",
  583. }
  584. func (d DataType) isSimpleType() bool {
  585. return d >= BIGINT && d <= BOOLEAN
  586. }
  587. func (d DataType) String() string {
  588. if d >= 0 && d < DataType(len(dataTypes)) {
  589. return dataTypes[d]
  590. }
  591. return ""
  592. }
  593. func getDataType(tok Token) DataType {
  594. switch tok {
  595. case XBIGINT:
  596. return BIGINT
  597. case XFLOAT:
  598. return FLOAT
  599. case XSTRING:
  600. return STRINGS
  601. case XDATETIME:
  602. return DATETIME
  603. case XBOOLEAN:
  604. return BOOLEAN
  605. case XARRAY:
  606. return ARRAY
  607. case XSTRUCT:
  608. return STRUCT
  609. }
  610. return UNKNOWN
  611. }