lexical.go 11 KB


  1. package xsql
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "strings"
  7. )
  // Token identifies the lexical class of a piece of scanned input.
  type Token int

  // The Token values. Their numeric order matters: isOperator treats everything
  // strictly between operatorBeg and operatorEnd as an operator, and
  // isTimeLiteral treats the DD..MS range as time-unit literals.
  const (
      // Special tokens.
      ILLEGAL Token = iota
      EOF
      WS
      COMMENT
      AS

      // Literals.
      IDENT     // main
      INTEGER   // 12345
      NUMBER    // 12345.67
      STRING    // "abc"
      BADSTRING // "abc

      // Operators, delimited by the unexported operatorBeg/operatorEnd markers.
      operatorBeg
      ADD         // +
      SUB         // -
      MUL         // *
      DIV         // /
      MOD         // %
      BITWISE_AND // &
      BITWISE_OR  // |
      BITWISE_XOR // ^
      AND         // AND
      OR          // OR
      EQ          // =
      NEQ         // !=
      LT          // <
      LTE         // <=
      GT          // >
      GTE         // >=
      SUBSET      // [
      ARROW       // ->
      operatorEnd

      // Miscellaneous characters.
      ASTERISK  // *
      COMMA     // ,
      LPAREN    // (
      RPAREN    // )
      LBRACKET  // [
      RBRACKET  // ]
      HASH      // #
      DOT       // .
      COLON     // :
      SEMICOLON // ;

      // Keywords.
      SELECT
      FROM
      JOIN
      INNER
      LEFT
      RIGHT
      FULL
      CROSS
      ON
      WHERE
      GROUP
      ORDER
      HAVING
      BY
      ASC
      DESC
      TRUE
      FALSE
      CREATE
      DROP
      EXPLAIN
      DESCRIBE
      SHOW
      STREAM
      STREAMS
      WITH

      // Type-name keywords; getDataType maps these to DataType values.
      XBIGINT
      XFLOAT
      XSTRING
      XDATETIME
      XBOOLEAN
      XARRAY
      XSTRUCT

      // Option keywords.
      DATASOURCE
      KEY
      FORMAT
      CONF_KEY
      TYPE
      STRICT_VALIDATION
      TIMESTAMP
      TIMESTAMP_FORMAT

      // Time-unit keywords; this contiguous range is what isTimeLiteral tests.
      DD
      HH
      MI
      SS
      MS
  )
  102. var tokens = []string{
  103. ILLEGAL: "ILLEGAL",
  104. EOF: "EOF",
  105. AS: "AS",
  106. WS: "WS",
  107. IDENT: "IDENT",
  108. INTEGER: "INTEGER",
  109. NUMBER: "NUMBER",
  110. STRING: "STRING",
  111. ADD: "+",
  112. SUB: "-",
  113. MUL: "*",
  114. DIV: "/",
  115. MOD: "%",
  116. BITWISE_AND: "&",
  117. BITWISE_OR: "|",
  118. BITWISE_XOR: "^",
  119. EQ: "=",
  120. NEQ: "!=",
  121. LT: "<",
  122. LTE: "<=",
  123. GT: ">",
  124. GTE: ">=",
  125. ARROW: "->",
  126. ASTERISK: "*",
  127. COMMA: ",",
  128. LPAREN: "(",
  129. RPAREN: ")",
  130. LBRACKET: "[",
  131. RBRACKET: "]",
  132. HASH: "#",
  133. DOT: ".",
  134. SEMICOLON: ";",
  135. COLON: ":",
  136. SELECT: "SELECT",
  137. FROM: "FROM",
  138. JOIN: "JOIN",
  139. LEFT: "LEFT",
  140. INNER: "INNER",
  141. ON: "ON",
  142. WHERE: "WHERE",
  143. GROUP: "GROUP",
  144. ORDER: "ORDER",
  145. HAVING: "HAVING",
  146. BY: "BY",
  147. ASC: "ASC",
  148. DESC: "DESC",
  149. CREATE: "CREATE",
  150. DROP: "RROP",
  151. EXPLAIN: "EXPLAIN",
  152. DESCRIBE: "DESCRIBE",
  153. SHOW: "SHOW",
  154. STREAM: "STREAM",
  155. STREAMS: "STREAMS",
  156. WITH: "WITH",
  157. XBIGINT: "BIGINT",
  158. XFLOAT: "FLOAT",
  159. XSTRING: "STRING",
  160. XDATETIME: "DATETIME",
  161. XBOOLEAN: "BOOLEAN",
  162. XARRAY: "ARRAY",
  163. XSTRUCT: "STRUCT",
  164. DATASOURCE: "DATASOURCE",
  165. KEY: "KEY",
  166. FORMAT: "FORMAT",
  167. CONF_KEY: "CONF_KEY",
  168. TYPE: "TYPE",
  169. STRICT_VALIDATION: "STRICT_VALIDATION",
  170. TIMESTAMP: "TIMESTAMP",
  171. TIMESTAMP_FORMAT: "TIMESTAMP_FORMAT",
  172. AND: "AND",
  173. OR: "OR",
  174. TRUE: "TRUE",
  175. FALSE: "FALSE",
  176. DD: "DD",
  177. HH: "HH",
  178. MI: "MI",
  179. SS: "SS",
  180. MS: "MS",
  181. }
  182. func (tok Token) String() string {
  183. if tok >= 0 && tok < Token(len(tokens)) {
  184. return tokens[tok]
  185. }
  186. return ""
  187. }
  // Scanner is a lexical scanner that reads runes from a buffered reader.
  type Scanner struct {
      r *bufio.Reader // buffered source that read and unread operate on
  }

  // NewScanner returns a Scanner that reads from r through a bufio.Reader.
  func NewScanner(r io.Reader) *Scanner {
      s := new(Scanner)
      s.r = bufio.NewReader(r)
      return s
  }
  194. func (s *Scanner) Scan() (tok Token, lit string) {
  195. ch := s.read()
  196. if isWhiteSpace(ch) {
  197. //s.unread()
  198. return s.ScanWhiteSpace()
  199. } else if isLetter(ch) {
  200. s.unread()
  201. return s.ScanIdent()
  202. } else if isQuotation(ch) {
  203. s.unread()
  204. return s.ScanString()
  205. } else if isDigit(ch) {
  206. s.unread()
  207. return s.ScanNumber(false, false)
  208. }
  209. switch ch {
  210. case eof:
  211. return EOF, tokens[EOF]
  212. case '=':
  213. return EQ, tokens[EQ]
  214. case '!':
  215. _, _ = s.ScanWhiteSpace()
  216. if r := s.read(); r == '=' {
  217. return NEQ, tokens[NEQ]
  218. } else {
  219. s.unread()
  220. }
  221. return EQ, tokens[EQ]
  222. case '<':
  223. _, _ = s.ScanWhiteSpace()
  224. if r := s.read(); r == '=' {
  225. return LTE, tokens[LTE]
  226. } else {
  227. s.unread()
  228. }
  229. return LT, tokens[LT]
  230. case '>':
  231. _, _ = s.ScanWhiteSpace()
  232. if r := s.read(); r == '=' {
  233. return GTE, tokens[GTE]
  234. } else {
  235. s.unread()
  236. }
  237. return GT, tokens[GT]
  238. case '+':
  239. return ADD, tokens[ADD]
  240. case '-':
  241. _, _ = s.ScanWhiteSpace()
  242. if r := s.read(); r == '-' {
  243. s.skipUntilNewline()
  244. return COMMENT, ""
  245. } else if (r == '>'){
  246. return ARROW, tokens[ARROW]
  247. } else if isDigit(r) {
  248. s.unread()
  249. return s.ScanNumber(false, true)
  250. } else if r == '.' {
  251. _, _ = s.ScanWhiteSpace()
  252. if r1 := s.read(); isDigit(r1) {
  253. s.unread()
  254. return s.ScanNumber(true, true)
  255. } else {
  256. s.unread()
  257. }
  258. s.unread()
  259. } else {
  260. s.unread()
  261. }
  262. return SUB, tokens[SUB]
  263. case '/':
  264. _, _ = s.ScanWhiteSpace()
  265. if r := s.read(); r == '*' {
  266. if err := s.skipUntilEndComment(); err != nil {
  267. return ILLEGAL, ""
  268. }
  269. return COMMENT, ""
  270. } else {
  271. s.unread()
  272. }
  273. return DIV, tokens[DIV]
  274. case '.':
  275. if r := s.read(); isDigit(r) {
  276. s.unread()
  277. return s.ScanNumber(true, false)
  278. }
  279. s.unread()
  280. return DOT, tokens[DOT]
  281. case '%':
  282. return MOD, tokens[MOD]
  283. case '&':
  284. return BITWISE_AND, tokens[BITWISE_AND]
  285. case '|':
  286. return BITWISE_OR, tokens[BITWISE_OR]
  287. case '^':
  288. return BITWISE_XOR, tokens[BITWISE_XOR]
  289. case '*':
  290. return ASTERISK, tokens[ASTERISK]
  291. case ',':
  292. return COMMA, tokens[COMMA]
  293. case '(':
  294. return LPAREN, tokens[LPAREN]
  295. case ')':
  296. return RPAREN, tokens[RPAREN]
  297. case '[':
  298. return LBRACKET, tokens[LBRACKET]
  299. case ']':
  300. return RBRACKET, tokens[RBRACKET]
  301. case ':':
  302. return COLON, tokens[COLON]
  303. case '#':
  304. return HASH, tokens[HASH]
  305. case ';':
  306. return SEMICOLON, tokens[SEMICOLON]
  307. }
  308. return ILLEGAL, ""
  309. }
  310. func (s *Scanner) ScanIdent() (tok Token, lit string) {
  311. var buf bytes.Buffer
  312. buf.WriteRune(s.read())
  313. for {
  314. if ch := s.read(); ch == eof {
  315. break
  316. } else if !isLetter(ch) && !isDigit(ch) && ch != '_' {
  317. s.unread()
  318. break
  319. } else {
  320. buf.WriteRune(ch)
  321. }
  322. }
  323. switch lit = strings.ToUpper(buf.String()); lit {
  324. case "SELECT":
  325. return SELECT, lit
  326. case "AS":
  327. return AS, lit
  328. case "FROM":
  329. return FROM, lit
  330. case "WHERE":
  331. return WHERE, lit
  332. case "AND":
  333. return AND, lit
  334. case "OR":
  335. return OR, lit
  336. case "GROUP":
  337. return GROUP, lit
  338. case "HAVING":
  339. return HAVING, lit
  340. case "ORDER":
  341. return ORDER, lit
  342. case "BY":
  343. return BY, lit
  344. case "DESC":
  345. return DESC, lit
  346. case "ASC":
  347. return ASC, lit
  348. case "INNER":
  349. return INNER, lit
  350. case "LEFT":
  351. return LEFT, lit
  352. case "RIGHT":
  353. return RIGHT, lit
  354. case "FULL":
  355. return FULL, lit
  356. case "CROSS":
  357. return CROSS, lit
  358. case "JOIN":
  359. return JOIN, lit
  360. case "ON":
  361. return ON, lit
  362. case "CREATE":
  363. return CREATE, lit
  364. case "DROP":
  365. return DROP, lit
  366. case "EXPLAIN":
  367. return EXPLAIN, lit
  368. case "DESCRIBE":
  369. return DESCRIBE, lit
  370. case "SHOW":
  371. return SHOW, lit
  372. case "STREAM":
  373. return STREAM, lit
  374. case "STREAMS":
  375. return STREAMS, lit
  376. case "WITH":
  377. return WITH, lit
  378. case "BIGINT":
  379. return XBIGINT, lit
  380. case "FLOAT":
  381. return XFLOAT, lit
  382. case "DATETIME":
  383. return XDATETIME, lit
  384. case "STRING":
  385. return XSTRING, lit
  386. case "BOOLEAN":
  387. return XBOOLEAN, lit
  388. case "ARRAY":
  389. return XARRAY, lit
  390. case "STRUCT":
  391. return XSTRUCT, lit
  392. case "DATASOURCE":
  393. return DATASOURCE, lit
  394. case "KEY":
  395. return KEY, lit
  396. case "FORMAT":
  397. return FORMAT, lit
  398. case "CONF_KEY":
  399. return CONF_KEY, lit
  400. case "TYPE":
  401. return TYPE, lit
  402. case "TRUE":
  403. return TRUE, lit
  404. case "FALSE":
  405. return FALSE, lit
  406. case "STRICT_VALIDATION":
  407. return STRICT_VALIDATION, lit
  408. case "TIMESTAMP":
  409. return TIMESTAMP, lit
  410. case "TIMESTAMP_FORMAT":
  411. return TIMESTAMP_FORMAT, lit
  412. case "DD":
  413. return DD, lit
  414. case "HH":
  415. return HH, lit
  416. case "MI":
  417. return MI, lit
  418. case "SS":
  419. return SS, lit
  420. case "MS":
  421. return MS, lit
  422. }
  423. return IDENT, buf.String()
  424. }
  425. func (s *Scanner) ScanString() (tok Token, lit string) {
  426. var buf bytes.Buffer
  427. _ = s.read()
  428. for {
  429. ch := s.read()
  430. if ch == '"' {
  431. break
  432. } else if ch == eof {
  433. return BADSTRING, buf.String()
  434. } else {
  435. buf.WriteRune(ch)
  436. }
  437. }
  438. return STRING, buf.String()
  439. }
  440. func (s *Scanner) ScanDigit() (tok Token, lit string) {
  441. var buf bytes.Buffer
  442. ch := s.read()
  443. buf.WriteRune(ch)
  444. for {
  445. if ch := s.read(); isDigit(ch) {
  446. buf.WriteRune(ch)
  447. } else {
  448. s.unread()
  449. break
  450. }
  451. }
  452. return INTEGER, buf.String()
  453. }
  454. func (s *Scanner) ScanNumber(startWithDot bool, isNeg bool) (tok Token, lit string) {
  455. var buf bytes.Buffer
  456. if isNeg {
  457. buf.WriteRune('-')
  458. }
  459. if startWithDot {
  460. buf.WriteRune('.')
  461. }
  462. ch := s.read()
  463. buf.WriteRune(ch)
  464. isNum := false
  465. for {
  466. if ch := s.read(); isDigit(ch) {
  467. buf.WriteRune(ch)
  468. } else if ch == '.' {
  469. isNum = true
  470. buf.WriteRune(ch)
  471. } else {
  472. s.unread()
  473. break
  474. }
  475. }
  476. if isNum || startWithDot {
  477. return NUMBER, buf.String()
  478. } else {
  479. return INTEGER, buf.String()
  480. }
  481. }
  482. func (s *Scanner) skipUntilNewline() {
  483. for {
  484. if ch := s.read(); ch == '\n' || ch == eof {
  485. return
  486. }
  487. }
  488. }
  489. func (s *Scanner) skipUntilEndComment() error {
  490. for {
  491. if ch1 := s.read(); ch1 == '*' {
  492. // We might be at the end.
  493. star:
  494. ch2 := s.read()
  495. if ch2 == '/' {
  496. return nil
  497. } else if ch2 == '*' {
  498. // We are back in the state machine since we see a star.
  499. goto star
  500. } else if ch2 == eof {
  501. return io.EOF
  502. }
  503. } else if ch1 == eof {
  504. return io.EOF
  505. }
  506. }
  507. }
  508. func (s *Scanner) ScanWhiteSpace() (tok Token, lit string) {
  509. var buf bytes.Buffer
  510. for {
  511. if ch := s.read(); ch == eof {
  512. break
  513. } else if !isWhiteSpace(ch) {
  514. s.unread()
  515. break
  516. } else {
  517. buf.WriteRune(ch)
  518. }
  519. }
  520. return WS, buf.String()
  521. }
  522. func (s *Scanner) read() rune {
  523. ch, _, err := s.r.ReadRune()
  524. if err != nil {
  525. return eof
  526. }
  527. return ch
  528. }
  529. func (s *Scanner) unread() {
  530. _ = s.r.UnreadRune()
  531. }
  // eof is the sentinel rune (NUL) that read returns when no further rune can
  // be obtained from the input.
  var eof rune = 0
  // isWhiteSpace reports whether r is a space, tab, carriage return or newline.
  func isWhiteSpace(r rune) bool {
      switch r {
      case ' ', '\t', '\r', '\n':
          return true
      }
      return false
  }
  // isLetter reports whether ch is an ASCII letter (a-z or A-Z).
  func isLetter(ch rune) bool {
      switch {
      case 'a' <= ch && ch <= 'z':
          return true
      case 'A' <= ch && ch <= 'Z':
          return true
      }
      return false
  }
  // isDigit reports whether ch is an ASCII decimal digit (0-9).
  func isDigit(ch rune) bool {
      if ch < '0' {
          return false
      }
      return ch <= '9'
  }
  // isQuotation reports whether ch is the double-quote character.
  func isQuotation(ch rune) bool {
      const doubleQuote = '"'
      return ch == doubleQuote
  }
  539. func (tok Token) isOperator() bool { return (tok > operatorBeg && tok < operatorEnd) || tok == ASTERISK || tok == LBRACKET }
  540. func (tok Token) isTimeLiteral() bool { return (tok >= DD && tok <= MS) }
  541. func (tok Token) allowedSourceToken() bool {
  542. return (tok == IDENT || tok == DIV || tok == HASH || tok == ADD)
  543. }
  544. //Allowed special field name token
  545. func (tok Token) allowedSFNToken() bool { return (tok == DOT) }
  546. func (tok Token) Precedence() int {
  547. switch tok {
  548. case OR:
  549. return 1
  550. case AND:
  551. return 2
  552. case EQ, NEQ, LT, LTE, GT, GTE:
  553. return 3
  554. case ADD, SUB, BITWISE_OR, BITWISE_XOR:
  555. return 4
  556. case MUL, DIV, MOD, BITWISE_AND, SUBSET, ARROW:
  557. return 5
  558. }
  559. return 0
  560. }
  // DataType enumerates the field data types known to the package.
  type DataType int

  // The DataType values. BIGINT through BOOLEAN are contiguous because
  // isSimpleType tests that range.
  const (
      UNKNOWN DataType = iota

      // Simple types.
      BIGINT
      FLOAT
      STRINGS
      DATETIME
      BOOLEAN

      // Remaining types, outside the simple range.
      ARRAY
      STRUCT
  )
  572. var dataTypes = []string{
  573. BIGINT : "bigint",
  574. FLOAT : "float",
  575. STRINGS : "string",
  576. DATETIME: "datetime",
  577. BOOLEAN : "boolean",
  578. ARRAY : "array",
  579. STRUCT : "struct",
  580. }
  581. func (d DataType) isSimpleType() bool {
  582. return d >= BIGINT && d <= BOOLEAN
  583. }
  584. func (d DataType) String() string {
  585. if d >= 0 && d < DataType(len(dataTypes)) {
  586. return dataTypes[d]
  587. }
  588. return ""
  589. }
  590. func getDataType(tok Token) DataType {
  591. switch tok {
  592. case XBIGINT:
  593. return BIGINT
  594. case XFLOAT:
  595. return FLOAT
  596. case XSTRING:
  597. return STRINGS
  598. case XDATETIME:
  599. return DATETIME
  600. case XBOOLEAN:
  601. return BOOLEAN
  602. case XARRAY:
  603. return ARRAY
  604. case XSTRUCT:
  605. return STRUCT
  606. }
  607. return UNKNOWN
  608. }