lexical.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652
  1. package xsql
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "strings"
  7. )
// Token identifies the lexical class of a piece of input produced by Scanner.
type Token int
// Token values. The numeric values come from iota, so the declaration order
// matters: isOperator and isTimeLiteral test ranges of these constants.
const (
	// Special tokens
	ILLEGAL Token = iota
	EOF
	WS
	COMMENT
	AS

	// Literals
	IDENT     // main
	INTEGER   // 12345
	NUMBER    //12345.67
	STRING    // "abc"
	BADSTRING // "abc

	// operatorBeg and operatorEnd are sentinels: isOperator reports true for
	// every token strictly between them.
	operatorBeg
	// Operators
	ADD         // +
	SUB         // -
	MUL         // *
	DIV         // /
	MOD         // %
	BITWISE_AND // &
	BITWISE_OR  // |
	BITWISE_XOR // ^
	AND         // AND
	OR          // OR
	EQ          // =
	NEQ         // !=
	LT          // <
	LTE         // <=
	GT          // >
	GTE         // >=
	SUBSET      //[
	ARROW       //->
	operatorEnd

	// Misc characters
	ASTERISK  // *
	COMMA     // ,
	LPAREN    // (
	RPAREN    // )
	LBRACKET  //[
	RBRACKET  //]
	HASH      // #
	DOT       // .
	COLON     //:
	SEMICOLON //;

	// Keywords
	SELECT
	FROM
	JOIN
	LEFT
	INNER
	ON
	WHERE
	GROUP
	ORDER
	BY
	ASC
	DESC
	TRUE
	FALSE
	CREATE
	DROP
	EXPLAIN
	DESCRIBE
	SHOW
	STREAM
	STREAMS
	WITH
	// Type-name keywords; getDataType maps them to DataType values.
	XBIGINT
	XFLOAT
	XSTRING
	XDATETIME
	XBOOLEAN
	XARRAY
	XSTRUCT
	DATASOURCE
	KEY
	FORMAT
	CONF_KEY
	TYPE
	STRICT_VALIDATION
	// Time-unit keywords; isTimeLiteral relies on DD..MS being contiguous.
	DD
	HH
	MI
	SS
	MS
)
  96. var tokens = []string{
  97. ILLEGAL: "ILLEGAL",
  98. EOF: "EOF",
  99. AS: "AS",
  100. WS: "WS",
  101. IDENT: "IDENT",
  102. INTEGER: "INTEGER",
  103. NUMBER: "NUMBER",
  104. STRING: "STRING",
  105. ADD: "+",
  106. SUB: "-",
  107. MUL: "*",
  108. DIV: "/",
  109. MOD: "%",
  110. BITWISE_AND: "&",
  111. BITWISE_OR: "|",
  112. BITWISE_XOR: "^",
  113. EQ: "=",
  114. NEQ: "!=",
  115. LT: "<",
  116. LTE: "<=",
  117. GT: ">",
  118. GTE: ">=",
  119. ARROW: "->",
  120. ASTERISK: "*",
  121. COMMA: ",",
  122. LPAREN: "(",
  123. RPAREN: ")",
  124. LBRACKET: "[",
  125. RBRACKET: "]",
  126. HASH: "#",
  127. DOT: ".",
  128. SEMICOLON: ";",
  129. COLON: ":",
  130. SELECT: "SELECT",
  131. FROM: "FROM",
  132. JOIN: "JOIN",
  133. LEFT: "LEFT",
  134. INNER: "INNER",
  135. ON: "ON",
  136. WHERE: "WHERE",
  137. GROUP: "GROUP",
  138. ORDER: "ORDER",
  139. BY: "BY",
  140. ASC: "ASC",
  141. DESC: "DESC",
  142. CREATE: "CREATE",
  143. DROP: "RROP",
  144. EXPLAIN: "EXPLAIN",
  145. DESCRIBE: "DESCRIBE",
  146. SHOW: "SHOW",
  147. STREAM: "STREAM",
  148. STREAMS: "STREAMS",
  149. WITH: "WITH",
  150. XBIGINT: "BIGINT",
  151. XFLOAT: "FLOAT",
  152. XSTRING: "STRING",
  153. XDATETIME: "DATETIME",
  154. XBOOLEAN: "BOOLEAN",
  155. XARRAY: "ARRAY",
  156. XSTRUCT: "STRUCT",
  157. DATASOURCE: "DATASOURCE",
  158. KEY: "KEY",
  159. FORMAT: "FORMAT",
  160. CONF_KEY: "CONF_KEY",
  161. TYPE: "TYPE",
  162. STRICT_VALIDATION: "STRICT_VALIDATION",
  163. AND: "AND",
  164. OR: "OR",
  165. TRUE: "TRUE",
  166. FALSE: "FALSE",
  167. DD: "DD",
  168. HH: "HH",
  169. MI: "MI",
  170. SS: "SS",
  171. MS: "MS",
  172. }
  173. func (tok Token) String() string {
  174. if tok >= 0 && tok < Token(len(tokens)) {
  175. return tokens[tok]
  176. }
  177. return ""
  178. }
// Scanner is a lexical scanner that reads runes from a buffered reader and
// groups them into tokens.
type Scanner struct {
	r *bufio.Reader // buffered input; read and unread go through it
}
  182. func NewScanner(r io.Reader) *Scanner {
  183. return &Scanner{r: bufio.NewReader(r)}
  184. }
  185. func (s *Scanner) Scan() (tok Token, lit string) {
  186. ch := s.read()
  187. if isWhiteSpace(ch) {
  188. //s.unread()
  189. return s.ScanWhiteSpace()
  190. } else if isLetter(ch) {
  191. s.unread()
  192. return s.ScanIdent()
  193. } else if isQuotation(ch) {
  194. s.unread()
  195. return s.ScanString()
  196. } else if isDigit(ch) {
  197. s.unread()
  198. return s.ScanNumber(false, false)
  199. }
  200. switch ch {
  201. case eof:
  202. return EOF, tokens[EOF]
  203. case '=':
  204. return EQ, tokens[EQ]
  205. case '!':
  206. _, _ = s.ScanWhiteSpace()
  207. if r := s.read(); r == '=' {
  208. return NEQ, tokens[NEQ]
  209. } else {
  210. s.unread()
  211. }
  212. return EQ, tokens[EQ]
  213. case '<':
  214. _, _ = s.ScanWhiteSpace()
  215. if r := s.read(); r == '=' {
  216. return LTE, tokens[LTE]
  217. } else {
  218. s.unread()
  219. }
  220. return LT, tokens[LT]
  221. case '>':
  222. _, _ = s.ScanWhiteSpace()
  223. if r := s.read(); r == '=' {
  224. return GTE, tokens[GTE]
  225. } else {
  226. s.unread()
  227. }
  228. return GT, tokens[GT]
  229. case '+':
  230. return ADD, tokens[ADD]
  231. case '-':
  232. _, _ = s.ScanWhiteSpace()
  233. if r := s.read(); r == '-' {
  234. s.skipUntilNewline()
  235. return COMMENT, ""
  236. } else if (r == '>'){
  237. return ARROW, tokens[ARROW]
  238. } else if isDigit(r) {
  239. s.unread()
  240. return s.ScanNumber(false, true)
  241. } else if r == '.' {
  242. _, _ = s.ScanWhiteSpace()
  243. if r1 := s.read(); isDigit(r1) {
  244. s.unread()
  245. return s.ScanNumber(true, true)
  246. } else {
  247. s.unread()
  248. }
  249. s.unread()
  250. } else {
  251. s.unread()
  252. }
  253. return SUB, tokens[SUB]
  254. case '/':
  255. _, _ = s.ScanWhiteSpace()
  256. if r := s.read(); r == '*' {
  257. if err := s.skipUntilEndComment(); err != nil {
  258. return ILLEGAL, ""
  259. }
  260. return COMMENT, ""
  261. } else {
  262. s.unread()
  263. }
  264. return DIV, tokens[DIV]
  265. case '.':
  266. if r := s.read(); isDigit(r) {
  267. s.unread()
  268. return s.ScanNumber(true, false)
  269. }
  270. s.unread()
  271. return DOT, tokens[DOT]
  272. case '%':
  273. return MOD, tokens[MOD]
  274. case '&':
  275. return BITWISE_AND, tokens[BITWISE_AND]
  276. case '|':
  277. return BITWISE_OR, tokens[BITWISE_OR]
  278. case '^':
  279. return BITWISE_XOR, tokens[BITWISE_XOR]
  280. case '*':
  281. return ASTERISK, tokens[ASTERISK]
  282. case ',':
  283. return COMMA, tokens[COMMA]
  284. case '(':
  285. return LPAREN, tokens[LPAREN]
  286. case ')':
  287. return RPAREN, tokens[RPAREN]
  288. case '[':
  289. return LBRACKET, tokens[LBRACKET]
  290. case ']':
  291. return RBRACKET, tokens[RBRACKET]
  292. case ':':
  293. return COLON, tokens[COLON]
  294. case '#':
  295. return HASH, tokens[HASH]
  296. case ';':
  297. return SEMICOLON, tokens[SEMICOLON]
  298. }
  299. return ILLEGAL, ""
  300. }
  301. func (s *Scanner) ScanIdent() (tok Token, lit string) {
  302. var buf bytes.Buffer
  303. buf.WriteRune(s.read())
  304. for {
  305. if ch := s.read(); ch == eof {
  306. break
  307. } else if !isLetter(ch) && !isDigit(ch) && ch != '_' {
  308. s.unread()
  309. break
  310. } else {
  311. buf.WriteRune(ch)
  312. }
  313. }
  314. switch lit = strings.ToUpper(buf.String()); lit {
  315. case "SELECT":
  316. return SELECT, lit
  317. case "AS":
  318. return AS, lit
  319. case "FROM":
  320. return FROM, lit
  321. case "WHERE":
  322. return WHERE, lit
  323. case "AND":
  324. return AND, lit
  325. case "OR":
  326. return OR, lit
  327. case "GROUP":
  328. return GROUP, lit
  329. case "ORDER":
  330. return ORDER, lit
  331. case "BY":
  332. return BY, lit
  333. case "DESC":
  334. return DESC, lit
  335. case "ASC":
  336. return ASC, lit
  337. case "INNER":
  338. return INNER, lit
  339. case "LEFT":
  340. return LEFT, lit
  341. case "JOIN":
  342. return JOIN, lit
  343. case "ON":
  344. return ON, lit
  345. case "CREATE":
  346. return CREATE, lit
  347. case "DROP":
  348. return DROP, lit
  349. case "EXPLAIN":
  350. return EXPLAIN, lit
  351. case "DESCRIBE":
  352. return DESCRIBE, lit
  353. case "SHOW":
  354. return SHOW, lit
  355. case "STREAM":
  356. return STREAM, lit
  357. case "STREAMS":
  358. return STREAMS, lit
  359. case "WITH":
  360. return WITH, lit
  361. case "BIGINT":
  362. return XBIGINT, lit
  363. case "FLOAT":
  364. return XFLOAT, lit
  365. case "DATETIME":
  366. return XDATETIME, lit
  367. case "STRING":
  368. return XSTRING, lit
  369. case "BOOLEAN":
  370. return XBOOLEAN, lit
  371. case "ARRAY":
  372. return XARRAY, lit
  373. case "STRUCT":
  374. return XSTRUCT, lit
  375. case "DATASOURCE":
  376. return DATASOURCE, lit
  377. case "KEY":
  378. return KEY, lit
  379. case "FORMAT":
  380. return FORMAT, lit
  381. case "CONF_KEY":
  382. return CONF_KEY, lit
  383. case "TYPE":
  384. return TYPE, lit
  385. case "TRUE":
  386. return TRUE, lit
  387. case "FALSE":
  388. return FALSE, lit
  389. case "STRICT_VALIDATION":
  390. return STRICT_VALIDATION, lit
  391. case "DD":
  392. return DD, lit
  393. case "HH":
  394. return HH, lit
  395. case "MI":
  396. return MI, lit
  397. case "SS":
  398. return SS, lit
  399. case "MS":
  400. return MS, lit
  401. }
  402. return IDENT, buf.String()
  403. }
  404. func (s *Scanner) ScanString() (tok Token, lit string) {
  405. var buf bytes.Buffer
  406. _ = s.read()
  407. for {
  408. ch := s.read()
  409. if ch == '"' {
  410. break
  411. } else if ch == eof {
  412. return BADSTRING, buf.String()
  413. } else {
  414. buf.WriteRune(ch)
  415. }
  416. }
  417. return STRING, buf.String()
  418. }
  419. func (s *Scanner) ScanDigit() (tok Token, lit string) {
  420. var buf bytes.Buffer
  421. ch := s.read()
  422. buf.WriteRune(ch)
  423. for {
  424. if ch := s.read(); isDigit(ch) {
  425. buf.WriteRune(ch)
  426. } else {
  427. s.unread()
  428. break
  429. }
  430. }
  431. return INTEGER, buf.String()
  432. }
  433. func (s *Scanner) ScanNumber(startWithDot bool, isNeg bool) (tok Token, lit string) {
  434. var buf bytes.Buffer
  435. if isNeg {
  436. buf.WriteRune('-')
  437. }
  438. if startWithDot {
  439. buf.WriteRune('.')
  440. }
  441. ch := s.read()
  442. buf.WriteRune(ch)
  443. isNum := false
  444. for {
  445. if ch := s.read(); isDigit(ch) {
  446. buf.WriteRune(ch)
  447. } else if ch == '.' {
  448. isNum = true
  449. buf.WriteRune(ch)
  450. } else {
  451. s.unread()
  452. break
  453. }
  454. }
  455. if isNum || startWithDot {
  456. return NUMBER, buf.String()
  457. } else {
  458. return INTEGER, buf.String()
  459. }
  460. }
  461. func (s *Scanner) skipUntilNewline() {
  462. for {
  463. if ch := s.read(); ch == '\n' || ch == eof {
  464. return
  465. }
  466. }
  467. }
  468. func (s *Scanner) skipUntilEndComment() error {
  469. for {
  470. if ch1 := s.read(); ch1 == '*' {
  471. // We might be at the end.
  472. star:
  473. ch2 := s.read()
  474. if ch2 == '/' {
  475. return nil
  476. } else if ch2 == '*' {
  477. // We are back in the state machine since we see a star.
  478. goto star
  479. } else if ch2 == eof {
  480. return io.EOF
  481. }
  482. } else if ch1 == eof {
  483. return io.EOF
  484. }
  485. }
  486. }
  487. func (s *Scanner) ScanWhiteSpace() (tok Token, lit string) {
  488. var buf bytes.Buffer
  489. for {
  490. if ch := s.read(); ch == eof {
  491. break
  492. } else if !isWhiteSpace(ch) {
  493. s.unread()
  494. break
  495. } else {
  496. buf.WriteRune(ch)
  497. }
  498. }
  499. return WS, buf.String()
  500. }
  501. func (s *Scanner) read() rune {
  502. ch, _, err := s.r.ReadRune()
  503. if err != nil {
  504. return eof
  505. }
  506. return ch
  507. }
// unread pushes the most recently read rune back onto the buffered reader.
// The error from UnreadRune is deliberately ignored, so a call that cannot be
// honoured has no effect.
func (s *Scanner) unread() {
	_ = s.r.UnreadRune()
}
// eof is the sentinel rune that read returns when the reader reports an error
// (including end of input). A literal NUL rune in the input compares equal to
// it.
var eof = rune(0)
  512. func isWhiteSpace(r rune) bool {
  513. return (r == ' ') || (r == '\t') || (r == '\r') || (r == '\n')
  514. }
  515. func isLetter(ch rune) bool { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') }
  516. func isDigit(ch rune) bool { return (ch >= '0' && ch <= '9') }
  517. func isQuotation(ch rune) bool { return ch == '"' }
  518. func (tok Token) isOperator() bool { return (tok > operatorBeg && tok < operatorEnd) || tok == ASTERISK || tok == LBRACKET }
  519. func (tok Token) isTimeLiteral() bool { return (tok >= DD && tok <= MS) }
  520. func (tok Token) allowedSourceToken() bool {
  521. return (tok == IDENT || tok == DIV || tok == HASH || tok == ADD)
  522. }
// allowedSFNToken reports whether tok is allowed inside a special field name.
// Only DOT qualifies.
func (tok Token) allowedSFNToken() bool { return (tok == DOT) }
  525. func (tok Token) Precedence() int {
  526. switch tok {
  527. case OR:
  528. return 1
  529. case AND:
  530. return 2
  531. case EQ, NEQ, LT, LTE, GT, GTE:
  532. return 3
  533. case ADD, SUB, BITWISE_OR, BITWISE_XOR:
  534. return 4
  535. case MUL, DIV, MOD, BITWISE_AND, SUBSET, ARROW:
  536. return 5
  537. }
  538. return 0
  539. }
// DataType enumerates the field data types that getDataType derives from the
// type-name keyword tokens.
type DataType int
// DataType values. The order matters: isSimpleType treats the inclusive range
// BIGINT..BOOLEAN as the simple types.
const (
	UNKNOWN DataType = iota // returned by getDataType for non-type tokens
	BIGINT
	FLOAT
	STRINGS
	DATETIME
	BOOLEAN
	ARRAY
	STRUCT
)
  551. var dataTypes = []string{
  552. BIGINT : "bigint",
  553. FLOAT : "float",
  554. STRINGS : "string",
  555. DATETIME: "datetime",
  556. BOOLEAN : "boolean",
  557. ARRAY : "array",
  558. STRUCT : "struct",
  559. }
  560. func (d DataType) isSimpleType() bool {
  561. return d >= BIGINT && d <= BOOLEAN
  562. }
  563. func (d DataType) String() string {
  564. if d >= 0 && d < DataType(len(dataTypes)) {
  565. return dataTypes[d]
  566. }
  567. return ""
  568. }
  569. func getDataType(tok Token) DataType {
  570. switch tok {
  571. case XBIGINT:
  572. return BIGINT
  573. case XFLOAT:
  574. return FLOAT
  575. case XSTRING:
  576. return STRINGS
  577. case XDATETIME:
  578. return DATETIME
  579. case XBOOLEAN:
  580. return BOOLEAN
  581. case XARRAY:
  582. return ARRAY
  583. case XSTRUCT:
  584. return STRUCT
  585. }
  586. return UNKNOWN
  587. }