From df1665313181c51bb31e0308313cc8c8272e6ac9 Mon Sep 17 00:00:00 2001 From: psteinroe Date: Thu, 21 Sep 2023 15:01:43 +0300 Subject: [PATCH 1/2] - refactor source parser to allow nested parsing - replace logos with custom regex parser in statement parser --- Cargo.lock | 55 +++--- crates/codegen/Cargo.toml | 2 + crates/codegen/src/syntax_kind.rs | 1 - crates/parser/Cargo.toml | 1 + crates/parser/src/lib.rs | 4 +- crates/parser/src/source_file.rs | 107 ----------- crates/parser/src/source_parser.rs | 180 ++++++++++++++++++ .../src/{statement.rs => statement_parser.rs} | 4 - crates/postgres_lsp/src/main.rs | 2 +- 9 files changed, 215 insertions(+), 141 deletions(-) delete mode 100644 crates/parser/src/source_file.rs create mode 100644 crates/parser/src/source_parser.rs rename crates/parser/src/{statement.rs => statement_parser.rs} (99%) diff --git a/Cargo.lock b/Cargo.lock index 9eed1789..7c6686c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,7 +40,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] @@ -112,7 +112,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.32", + "syn 2.0.35", "which", ] @@ -191,6 +191,8 @@ dependencies = [ "pg_query_proto_parser", "proc-macro2", "quote", + "regex", + "syn 2.0.35", ] [[package]] @@ -255,7 +257,7 @@ checksum = "0da6c38bede0ecec78757fe92451c463ac9e6d37961c2cbce6a20be917951baf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] @@ -416,7 +418,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] @@ -649,7 +651,7 @@ dependencies = [ "proc-macro2", "quote", "regex-syntax 0.6.29", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] @@ -676,9 +678,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.5.0" +version = "2.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" [[package]] name = "minimal-lexical" @@ -778,6 +780,7 @@ dependencies = [ "cstree", "env_logger", "insta", + "lazy_static", "log", "logos", "pg_query", @@ -851,7 +854,7 @@ checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] @@ -888,7 +891,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" dependencies = [ "proc-macro2", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] @@ -917,9 +920,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" dependencies = [ "unicode-ident", ] @@ -1035,25 +1038,25 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.3" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a" +checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax 
0.7.4", + "regex-syntax 0.7.5", ] [[package]] name = "regex-automata" -version = "0.3.6" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" +checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.4", + "regex-syntax 0.7.5", ] [[package]] @@ -1064,9 +1067,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "ropey" @@ -1132,7 +1135,7 @@ checksum = "dc59dfdcbad1437773485e0367fea4b090a2e0a16d9ffc46af47764536a298ec" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] @@ -1154,7 +1157,7 @@ checksum = "8725e1dfadb3a50f7e5ce0b1a540466f6ed3fe7a0fca2ac2b8b831d31316bd00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] @@ -1234,9 +1237,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.32" +version = "2.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239814284fd6f1a4ffe4ca893952cdd93c224b6a1571c9a9eadd670295c0c9e2" +checksum = "59bf04c28bee9043ed9ea1e41afc0552288d3aba9c6efdd78903b802926f4879" dependencies = [ "proc-macro2", "quote", @@ -1288,7 +1291,7 @@ checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] @@ -1350,7 +1353,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] @@ -1447,7 +1450,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.32", + "syn 2.0.35", ] [[package]] diff --git a/crates/codegen/Cargo.toml b/crates/codegen/Cargo.toml index 86b06851..80d0b3d6 100644 --- a/crates/codegen/Cargo.toml +++ b/crates/codegen/Cargo.toml @@ -9,6 +9,8 @@ edition = "2021" proc-macro2 = "1.0.66" quote = "1.0.33" pg_query_proto_parser.workspace = true +syn = { version = "2.0.35", features = ["full"] } +regex = "1.9.5" [lib] proc-macro = true diff --git a/crates/codegen/src/syntax_kind.rs b/crates/codegen/src/syntax_kind.rs index 652cb16b..b59ab9df 100644 --- a/crates/codegen/src/syntax_kind.rs +++ b/crates/codegen/src/syntax_kind.rs @@ -1,5 +1,4 @@ use std::collections::HashSet; -use std::env::current_dir; use pg_query_proto_parser::{Node, ProtoParser, Token}; use proc_macro2::{Ident, Literal}; diff --git a/crates/parser/Cargo.toml b/crates/parser/Cargo.toml index 2049bace..65e25a5f 100644 --- a/crates/parser/Cargo.toml +++ b/crates/parser/Cargo.toml @@ -14,6 +14,7 @@ regex = "1.9.1" serde = { version = "1.0", features = ["derive"] } env_logger = { version = "0.9.1" } log = { version = "0.4.20" } +lazy_static = "1.4.0" codegen.workspace = true pg_query_proto_parser.workspace = true diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs index e871b2a5..90b2f9a2 100644 --- a/crates/parser/src/lib.rs +++ b/crates/parser/src/lib.rs @@ -18,8 +18,8 @@ mod ast_node; mod parser; mod sibling_token; -mod source_file; -mod statement; +mod 
source_parser;
+mod statement_parser;
 mod syntax_error;
 mod syntax_kind_codegen;
 mod syntax_node;
diff --git a/crates/parser/src/source_file.rs b/crates/parser/src/source_file.rs
deleted file mode 100644
index 9ae9b641..00000000
--- a/crates/parser/src/source_file.rs
+++ /dev/null
@@ -1,107 +0,0 @@
-use logos::Logos;
-
-use crate::{parser::Parser, syntax_kind_codegen::SyntaxKind};
-
-/// A super simple lexer for sql files that splits the input into indivudual statements and
-/// comments.
-///
-/// pg_query.rs only parses valid statements, and also fail to parse all statements if any contain syntax errors.
-/// To circumvent this, we use a lexer to split the input into statements, and then parse each statement individually.
-///
-/// This regex-based lexer does the split.
-#[derive(Logos, Debug, PartialEq)]
-#[logos(skip r"[ \t\f]+")] // Ignore this regex pattern between tokens
-pub enum SourceFileToken {
-    #[regex("[a-zA-Z0-9_]+(?:'[^']*'|(?:\\$\\$[^$]*\\$\\$|[^';])+)*;"gm)]
-    Statement,
-    #[regex("\n+"gm)]
-    Newline,
-    #[regex("/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/|--[^\n]*"g)]
-    Comment,
-}
-
-impl Parser {
-    /// Parse a source file
-    ///
-    /// TODO: rename to `parse_source_at(text: &str, at: Option<u32>)`, and allow parsing substatements, e.g. bodies of create
-    /// function statements.
-    pub fn parse_source_file(&mut self, text: &str) {
-        let mut lexer = SourceFileToken::lexer(text);
-
-        self.start_node_at(SyntaxKind::SourceFile, 0);
-        while let Some(token) = lexer.next() {
-            match token {
-                Ok(token) => {
-                    match token {
-                        SourceFileToken::Comment => {
-                            self.token(SyntaxKind::Comment, lexer.slice());
-                        }
-                        SourceFileToken::Newline => {
-                            self.token(SyntaxKind::Newline, lexer.slice());
-                        }
-                        SourceFileToken::Statement => {
-                            self.parse_statement(lexer.slice(), Some(lexer.span().start as u32));
-                        }
-                    };
-                }
-                Err(_) => panic!("Unknown SourceFileToken: {:?}", lexer.span()),
-            }
-        }
-        self.finish_node();
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_source_file_lexer() {
-        let input = "select * from contact where id = '123';\n\n-- test comment\n\nselect wrong statement;\n\nselect id,username from contact\n\nselect id,name\nfrom contact -- test inline comment\nwhere id = '123';\n\n";
-
-        let mut lex = SourceFileToken::lexer(&input);
-
-        assert_eq!(lex.next(), Some(Ok(SourceFileToken::Statement)));
-        assert_eq!(lex.slice(), "select * from contact where id = '123';");
-
-        assert_eq!(lex.next(), Some(Ok(SourceFileToken::Newline)));
-
-        assert_eq!(lex.next(), Some(Ok(SourceFileToken::Comment)));
-        assert_eq!(lex.slice(), "-- test comment");
-
-        assert_eq!(lex.next(), Some(Ok(SourceFileToken::Newline)));
-
-        assert_eq!(lex.next(), Some(Ok(SourceFileToken::Statement)));
-        assert_eq!(lex.slice(), "select wrong statement;");
-
-        assert_eq!(lex.next(), Some(Ok(SourceFileToken::Newline)));
-
-        assert_eq!(lex.next(), Some(Ok(SourceFileToken::Statement)));
-        assert_eq!(lex.slice(), "select id,username from contact\n\nselect id,name\nfrom contact -- test inline comment\nwhere id = '123';");
-    }
-
-    #[test]
-    fn test_source_file_parser() {
-        let input = "select id, name from users where id = '1224';
-
-select select;
-
-
-
-
-
-select 1;
-
-";
-
-        let mut parser = Parser::new();
-        parser.parse_source_file(input);
-        let parsed = parser.finish();
-
-        dbg!(parsed.errors);
-
-        dbg!(&parsed.cst);
-
-        assert_eq!(parsed.cst.text(), input);
-    }
-}
diff --git a/crates/parser/src/source_parser.rs b/crates/parser/src/source_parser.rs
new file mode 100644
index 00000000..b7a727ec
--- /dev/null
+++ b/crates/parser/src/source_parser.rs
@@ -0,0 +1,180 @@
+use cstree::text::{TextRange, TextSize};
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::{parser::Parser, syntax_kind_codegen::SyntaxKind};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SourceFileToken {
+    Statement,
+    Newline,
+    Comment,
+}
+
+// Thanks to `lazy_static`, the regular expression is compiled only once.
+lazy_static! {
+    static ref PATTERN_LEXER: Regex = Regex::new(r"(?P<statement>[a-zA-Z0-9_]+(?:'[^']*'|(?:\$\$[^$]*\$\$|[^';])+)*;)|(?P<comment>/\*[^*]*\*+(?:[^/*][^*]*\*+)*/|--[^\n]*)|(?P<newline>\n+)").unwrap();
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct Token {
+    kind: SourceFileToken,
+    text: String,
+    span: TextRange,
+}
+
+/// A super simple lexer for sql files that splits the input into individual statements and
+/// comments.
+///
+/// pg_query.rs only parses valid statements, and also fails to parse all statements if any contain syntax errors.
+/// To circumvent this, we use a lexer to split the input into statements, and then parse each statement individually.
+///
+/// This regex-based lexer does the split.
+///
+/// We cannot use logos because it relies on `regex-syntax`, which does not support all the regex features this pattern needs.
fn tokens(input: &str) -> Vec<Token> {
+    let mut tokens = Vec::new();
+    // Running byte offset; assumes statements, comments and newlines tile the
+    // input, so advancing by each match's length stays in sync with the source.
+    let mut offset = 0;
+
+    for cap in PATTERN_LEXER.captures_iter(input) {
+        let len: u32 = if let Some(statement) = cap.name("statement") {
+            let l = u32::try_from(statement.as_str().len()).unwrap();
+            tokens.push(Token {
+                kind: SourceFileToken::Statement,
+                text: statement.as_str().to_string(),
+                span: TextRange::new(TextSize::from(offset), TextSize::from(offset + l)),
+            });
+            l
+        } else if let Some(comment) = cap.name("comment") {
+            let l = u32::try_from(comment.as_str().len()).unwrap();
+            tokens.push(Token {
+                kind: SourceFileToken::Comment,
+                text: comment.as_str().to_string(),
+                span: TextRange::new(TextSize::from(offset), TextSize::from(offset + l)),
+            });
+            l
+        } else if let Some(newline) = cap.name("newline") {
+            let l = u32::try_from(newline.as_str().len()).unwrap();
+            tokens.push(Token {
+                kind: SourceFileToken::Newline,
+                text: newline.as_str().to_string(),
+                span: TextRange::new(TextSize::from(offset), TextSize::from(offset + l)),
+            });
+            l
+        } else {
+            panic!("No match");
+        };
+
+        offset += len;
+    }
+
+    tokens
+}
+
+impl Parser {
+    /// Parse a source file, optionally at an offset into an outer document to allow nested parsing (e.g. bodies of create function statements).
+    pub fn parse_source_at(&mut self, text: &str, at_offset: Option<u32>) {
+        let offset = at_offset.unwrap_or(0);
+
+        let tokens = tokens(text);
+        let mut tokens_iter = tokens.iter();
+
+        self.start_node_at(SyntaxKind::SourceFile, 0);
+        while let Some(token) = tokens_iter.next() {
+            match token.kind {
+                SourceFileToken::Comment => {
+                    self.token(SyntaxKind::Comment, token.text.as_str());
+                }
+                SourceFileToken::Newline => {
+                    self.token(SyntaxKind::Newline, token.text.as_str());
+                }
+                SourceFileToken::Statement => {
+                    self.parse_statement(
+                        token.text.as_str(),
+                        Some(offset + u32::from(token.span.start())),
+                    );
+                }
+            };
+        }
+        self.finish_node();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_source_file_lexer() {
+        let input = "select * from contact where id = '123';\n\n-- test comment\n\nselect wrong statement;\n\nselect id,username from contact\n\nselect id,name\nfrom contact -- test inline comment\nwhere id = '123';\n\n";
+
+        let tokens = tokens(input);
+        let mut tokens_iter = tokens.iter();
+
+        let token = tokens_iter.next().unwrap();
+        assert_eq!(token.kind, SourceFileToken::Statement);
+        assert_eq!(token.text, "select * from contact where id = '123';");
+
+        let token = tokens_iter.next().unwrap();
+        assert_eq!(token.kind, SourceFileToken::Newline);
+
+        let token = tokens_iter.next().unwrap();
+        assert_eq!(token.kind, SourceFileToken::Comment);
+        assert_eq!(token.text, "-- test comment");
+
+        let token = tokens_iter.next().unwrap();
+        assert_eq!(token.kind, SourceFileToken::Newline);
+
+        let token = tokens_iter.next().unwrap();
+        assert_eq!(token.kind, SourceFileToken::Statement);
+        assert_eq!(token.text, "select wrong statement;");
+
+        let token = tokens_iter.next().unwrap();
+        assert_eq!(token.kind, SourceFileToken::Newline);
+
+        let token = tokens_iter.next().unwrap();
+        assert_eq!(token.kind, SourceFileToken::Statement);
+        assert_eq!(token.text, "select id,username from contact\n\nselect id,name\nfrom contact -- test inline comment\nwhere id = '123';");
+    }
+
+    #[test]
+    fn test_source_file_parser() {
+        let input = "select id, name from users where id = '1224';
+
+select select;
+
+
+
+
+
+select 1;
+
+";
+
+        let mut parser = Parser::new();
+        parser.parse_source_at(input, None);
+        let parsed = parser.finish();
+
+        assert_eq!(parsed.cst.text(), input);
+    }
+
+    #[test]
+    fn test_lexer_with_nested_statements() {
+        let input = "select * from test;
+
+select 123;
+
+CREATE FUNCTION dup(in int, out f1 int, out f2 text)
+    AS $$ SELECT $1, CAST($1 AS text) || ' is text;' $$
+    LANGUAGE SQL;";
+
+        let mut parser = Parser::new();
+        parser.parse_source_at(input, None);
+        let parsed = parser.finish();
+
+        assert_eq!(parsed.cst.text(), input);
+    }
+}
diff --git a/crates/parser/src/statement.rs b/crates/parser/src/statement_parser.rs
similarity index 99%
rename from crates/parser/src/statement.rs
rename to crates/parser/src/statement_parser.rs
index 51b488b5..d0f6a25c 100644
--- a/crates/parser/src/statement.rs
+++ b/crates/parser/src/statement_parser.rs
@@ -173,8 +173,6 @@ mod tests {
         parser.parse_statement(input, None);
         let parsed = parser.finish();
 
-        dbg!(&parsed.cst);
-
         assert_eq!(parsed.cst.text(), input);
     }
 
@@ -188,8 +186,6 @@ mod tests {
         parser.parse_statement(input, None);
         let parsed = parser.finish();
 
-        dbg!(&parsed.cst);
-
         assert_eq!(parsed.cst.text(), input);
     }
 }
diff --git a/crates/postgres_lsp/src/main.rs b/crates/postgres_lsp/src/main.rs
index 11019ddb..88c066e4 100644
--- a/crates/postgres_lsp/src/main.rs
+++ b/crates/postgres_lsp/src/main.rs
@@ -245,7 +245,7 @@ impl Backend {
         let rope = ropey::Rope::from_str(&params.text);
 
         let mut parser = Parser::new();
-        parser.parse_source_file(&params.text);
+        parser.parse_source_at(&params.text, None);
 
         let result = parser.finish();
 

From 80b125775a15a458ffe6abb4f5919d141ebaa5d9 Mon Sep 17 00:00:00 2001
From: psteinroe
Date: Thu, 21 Sep 2023 16:36:00 +0300
Subject: [PATCH 2/2] chore: cleanup

---
 Cargo.lock                | 2 --
 crates/codegen/Cargo.toml | 2 --
 2 files changed, 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 7c6686c6..1abb92bf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -191,8 +191,6 @@ dependencies = [
  "pg_query_proto_parser",
  "proc-macro2",
  "quote",
- "regex",
- "syn 2.0.35",
 ]
 
 [[package]]
diff --git a/crates/codegen/Cargo.toml b/crates/codegen/Cargo.toml
index 80d0b3d6..86b06851 100644
--- a/crates/codegen/Cargo.toml
+++ b/crates/codegen/Cargo.toml
@@ -9,8 +9,6 @@ edition = "2021"
 proc-macro2 = "1.0.66"
 quote = "1.0.33"
 pg_query_proto_parser.workspace = true
-syn = { version = "2.0.35", features = ["full"] }
-regex = "1.9.5"
 
 [lib]
 proc-macro = true
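
For reference, a minimal usage sketch of the new entry point, mirroring the call
site in crates/postgres_lsp/src/main.rs and the tests above. Only `Parser`,
`parse_source_at`, `finish`, and the `cst` field of the result are taken from
this patch; the crate name `parser` and the sample input are illustrative:

    // Split a document into statements/comments/newlines, parse each statement
    // individually, and get back a lossless CST.
    use parser::Parser;

    fn main() {
        let input = "select 1;\n\n-- a comment\nselect 2;";

        let mut parser = Parser::new();
        // `None` parses from offset 0; `Some(n)` shifts statement offsets by `n`,
        // which is what enables nested parsing of e.g. function bodies.
        parser.parse_source_at(input, None);
        let parsed = parser.finish();

        // The CST is lossless: its text reproduces the input exactly.
        assert_eq!(parsed.cst.text(), input);
    }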