diff --git a/src/ast/mod.rs b/src/ast/mod.rs index 447d89bb4..d21499a7a 100644 --- a/src/ast/mod.rs +++ b/src/ast/mod.rs @@ -844,6 +844,43 @@ impl fmt::Display for CaseWhen { } } +/// Parsing mode for `XMLPARSE`. +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] +pub enum XmlParseMode { + /// `CONTENT` + Content, + /// `DOCUMENT` + Document, +} + +impl fmt::Display for XmlParseMode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + XmlParseMode::Content => write!(f, "CONTENT"), + XmlParseMode::Document => write!(f, "DOCUMENT"), + } + } +} + +/// `XMLPARSE(CONTENT|DOCUMENT expr)`. +#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))] +pub struct XmlParseExpr { + /// Parsing mode. + pub mode: XmlParseMode, + /// Expression to parse as XML. + pub expr: Box, +} + +impl fmt::Display for XmlParseExpr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "XMLPARSE({} {})", self.mode, self.expr) + } +} + /// An SQL expression of any type. /// /// # Semantics / Type Checking @@ -1233,6 +1270,8 @@ pub enum Expr { /// This can represent ANSI SQL `DATE`, `TIME`, and `TIMESTAMP` literals (such as `DATE '2020-01-01'`), /// as well as constants of other types (a non-standard PostgreSQL extension). TypedString(TypedString), + /// XML parse expression: `XMLPARSE(CONTENT|DOCUMENT expr)`. + XmlParse(XmlParseExpr), /// Scalar function call e.g. `LEFT(foo, 5)` Function(Function), /// `CASE [] WHEN THEN ... [ELSE ] END` @@ -2015,6 +2054,7 @@ impl fmt::Display for Expr { Expr::Value(v) => write!(f, "{v}"), Expr::Prefixed { prefix, value } => write!(f, "{prefix} {value}"), Expr::TypedString(ts) => ts.fmt(f), + Expr::XmlParse(xml_parse) => xml_parse.fmt(f), Expr::Function(fun) => fun.fmt(f), Expr::Case { case_token: _, diff --git a/src/ast/spans.rs b/src/ast/spans.rs index 360d7707f..f7916d085 100644 --- a/src/ast/spans.rs +++ b/src/ast/spans.rs @@ -1555,6 +1555,7 @@ impl Spanned for Expr { Expr::Nested(expr) => expr.span(), Expr::Value(value) => value.span(), Expr::TypedString(TypedString { value, .. }) => value.span(), + Expr::XmlParse(xml_parse) => xml_parse.expr.span(), Expr::Function(function) => function.span(), Expr::GroupingSets(vec) => { union_spans(vec.iter().flat_map(|i| i.iter().map(|k| k.span()))) diff --git a/src/parser/mod.rs b/src/parser/mod.rs index a7e641f98..808bf53f7 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -2527,8 +2527,43 @@ impl<'a> Parser<'a> { }) } + /// Consume the next token if it is an unquoted word matching `expected` + /// (case-insensitive), returning whether it was consumed. + fn parse_unquoted_word_value(&mut self, expected: &str) -> bool { + if let Token::Word(word) = &self.peek_token_ref().token { + if word.quote_style.is_none() && word.value.eq_ignore_ascii_case(expected) { + self.next_token(); + return true; + } + } + false + } + + fn parse_xml_parse_mode(&mut self) -> Result { + if self.parse_unquoted_word_value("content") { + Ok(XmlParseMode::Content) + } else if self.parse_unquoted_word_value("document") { + Ok(XmlParseMode::Document) + } else { + self.expected_ref("CONTENT or DOCUMENT", self.peek_token_ref()) + } + } + + fn parse_xmlparse_expr(&mut self) -> Result { + self.expect_token(&Token::LParen)?; + let mode = self.parse_xml_parse_mode()?; + let expr = Box::new(self.parse_expr()?); + self.expect_token(&Token::RParen)?; + Ok(Expr::XmlParse(XmlParseExpr { mode, expr })) + } + /// Parse a function call expression named by `name` and return it as an `Expr`. pub fn parse_function(&mut self, name: ObjectName) -> Result { + if self.dialect.supports_xml_expressions() + && Self::is_simple_unquoted_object_name(&name, "xmlparse") + { + return self.parse_xmlparse_expr(); + } self.parse_function_call(name).map(Expr::Function) } diff --git a/tests/sqlparser_common.rs b/tests/sqlparser_common.rs index b561f8935..f7a124a44 100644 --- a/tests/sqlparser_common.rs +++ b/tests/sqlparser_common.rs @@ -19014,6 +19014,46 @@ fn parse_aliased_function_args() { .is_err()); } +#[test] +fn parse_xmlparse() { + let dialects = all_dialects_where(|d| d.supports_xml_expressions()); + + let select = dialects.verified_only_select_with_canonical( + "SELECT xmlparse(content '')", + "SELECT XMLPARSE(CONTENT '')", + ); + match &select.projection[0] { + UnnamedExpr(Expr::XmlParse(XmlParseExpr { mode, .. })) => { + assert_eq!(*mode, XmlParseMode::Content); + } + item => panic!("expected XmlParse expression, got {item:?}"), + } + + let select = dialects.verified_only_select_with_canonical( + "SELECT xmlparse(document '')", + "SELECT XMLPARSE(DOCUMENT '')", + ); + match &select.projection[0] { + UnnamedExpr(Expr::XmlParse(XmlParseExpr { mode, .. })) => { + assert_eq!(*mode, XmlParseMode::Document); + } + item => panic!("expected XmlParse expression, got {item:?}"), + } + + // XMLPARSE requires a CONTENT or DOCUMENT mode. + assert!(dialects + .parse_sql_statements("SELECT xmlparse('')") + .is_err()); + + // On dialects without XML support, `xmlparse` stays a regular function + // and the special `CONTENT ` syntax is rejected. + let others = all_dialects_except(|d| d.supports_xml_expressions()); + others.verified_only_select("SELECT xmlparse(1)"); + assert!(others + .parse_sql_statements("SELECT xmlparse(content '')") + .is_err()); +} + /// Regression test for the 2^N parse-time blowup in `parse_compound_expr` on /// inputs like `IF a0.a1...aN.#`. The parse is run on a worker thread and the /// main thread asserts that it reports back within a generous timeout. Post-fix diff --git a/tests/sqlparser_postgres.rs b/tests/sqlparser_postgres.rs index 713d465a8..ec3b726f8 100644 --- a/tests/sqlparser_postgres.rs +++ b/tests/sqlparser_postgres.rs @@ -3951,6 +3951,47 @@ fn parse_xmlforest_aliased_arguments() { ); } +#[test] +fn parse_xmlparse() { + // Regression statements covering CONTENT and DOCUMENT modes with valid, + // invalid, and edge-case XML strings (parsing only, no semantic checks). + let statements = [ + "SELECT XMLPARSE(CONTENT '')", + "SELECT XMLPARSE(CONTENT ' ')", + "SELECT XMLPARSE(CONTENT 'abc')", + "SELECT XMLPARSE(CONTENT 'x')", + "SELECT XMLPARSE(CONTENT '&')", + "SELECT XMLPARSE(CONTENT '&idontexist;')", + "SELECT XMLPARSE(CONTENT '&idontexist;')", + "SELECT XMLPARSE(CONTENT '')", + "SELECT XMLPARSE(DOCUMENT ' ')", + "SELECT XMLPARSE(DOCUMENT 'abc')", + "SELECT XMLPARSE(DOCUMENT 'x')", + "SELECT XMLPARSE(DOCUMENT '&')", + "SELECT XMLPARSE(DOCUMENT '&idontexist;')", + "SELECT XMLPARSE(DOCUMENT '&idontexist;')", + "SELECT XMLPARSE(DOCUMENT '')", + ]; + for sql in statements { + pg().verified_stmt(sql); + } + + // Lowercase keywords canonicalize to uppercase. + let select = pg().verified_only_select_with_canonical( + "SELECT xmlparse(content '')", + "SELECT XMLPARSE(CONTENT '')", + ); + assert_eq!( + expr_from_projection(&select.projection[0]), + &Expr::XmlParse(XmlParseExpr { + mode: XmlParseMode::Content, + expr: Box::new(Expr::Value( + Value::SingleQuotedString("".to_string()).into() + )), + }) + ); +} + #[test] fn parse_xml_typed_string() { // xml '...' should parse as a TypedString on PostgreSQL and Generic