From cd8b5e97a58570848e0806e9962f40844a389ca8 Mon Sep 17 00:00:00 2001 From: Matthias Stemmler Date: Sat, 6 Jun 2026 20:30:27 +0200 Subject: [PATCH 1/3] Fix handling of node numbers in queries with multiple alternatives --- CHANGELOG.md | 4 + graphannis/src/annis/db/aql/conjunction.rs | 80 ++++++++++--------- .../src/annis/db/aql/conjunction/tests.rs | 19 +++++ 3 files changed, 64 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f7e16399d..195638345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Fixed handling of node numbers in queries with multiple alternatives. + ## [4.1.4] - 2026-04-18 ### Fixed diff --git a/graphannis/src/annis/db/aql/conjunction.rs b/graphannis/src/annis/db/aql/conjunction.rs index 9268699c5..6ceda95a6 100644 --- a/graphannis/src/annis/db/aql/conjunction.rs +++ b/graphannis/src/annis/db/aql/conjunction.rs @@ -112,35 +112,6 @@ fn update_components_for_nodes( } } -fn get_cost_estimates<'a>( - op_spec: &BinaryOperatorSpecEntry, - node2cost: &'a BTreeMap, -) -> Option<(&'a CostEstimate, &'a CostEstimate)> { - if let (Some(cost_lhs), Some(cost_rhs)) = ( - node2cost.get(&op_spec.args.left), - node2cost.get(&op_spec.args.right), - ) { - Some((cost_lhs, cost_rhs)) - } else { - None - } -} - -/// Returns true if it is estimated to switch the operands in a join. -fn should_switch_operand_order( - op_spec: &BinaryOperatorSpecEntry, - node2cost: &BTreeMap, -) -> bool { - if let Some((cost_lhs, cost_rhs)) = get_cost_estimates(op_spec, node2cost) - && cost_rhs.output < cost_lhs.output - { - // switch operands - return true; - } - - false -} - fn create_index_join<'b>( db: &'b AnnotationGraph, config: &Config, @@ -187,7 +158,7 @@ fn create_join<'b>( if exec_right.as_nodesearch().is_some() && let BinaryOperator::Index(op) = op_entry.op { - // we can use directly use an index join + // we can directly use an index join return create_index_join( db, config, @@ -200,7 +171,7 @@ fn create_join<'b>( } if exec_left.as_nodesearch().is_some() { - // avoid a nested loop join by switching the operand and using and index join when possible + // avoid a nested loop join by switching the operand and using an index join when possible if let Some(BinaryOperator::Index(inverse_op)) = op_entry.op.get_inverse_operator(db)? { let inverse_args = BinaryOperatorArguments { left: op_entry.args.right, @@ -395,7 +366,7 @@ impl Conjunction { } /// Return the variable name for a node number. The node number is the - /// position of an node description in the query. In case of a query with + /// position of a node description in the query. In case of a query with /// multiple alternatives, this refers to the node number of the whole query and /// not only the conjunction. pub fn get_variable_by_node_nr(&self, node_nr: usize) -> Option { @@ -493,7 +464,7 @@ impl Conjunction { family_operators.push(best_operator_order.clone()); for i in 0..num_new_generations { - // use the the previous generation as basis + // use the previous generation as basis let mut tmp_operators = family_operators[i].clone(); // randomly select two joins let mut a = 0; @@ -561,7 +532,7 @@ impl Conjunction { if let Some(node_search_cost) = desc.cost.as_ref() { for e in op_spec_entries { let op_spec = &e.op; - if e.args.left == desc.component_nr { + if e.args.left - self.var_idx_offset == desc.component_nr { // get the necessary components and count the number of nodes in these components let components = op_spec.necessary_components(db); if !components.is_empty() { @@ -686,6 +657,37 @@ impl Conjunction { Ok(()) } + fn get_cost_estimates<'a>( + &self, + op_spec: &BinaryOperatorSpecEntry, + node2cost: &'a BTreeMap, + ) -> Option<(&'a CostEstimate, &'a CostEstimate)> { + if let (Some(cost_lhs), Some(cost_rhs)) = ( + node2cost.get(&(op_spec.args.left - self.var_idx_offset)), + node2cost.get(&(op_spec.args.right - self.var_idx_offset)), + ) { + Some((cost_lhs, cost_rhs)) + } else { + None + } + } + + /// Returns true if it is estimated to switch the operands in a join. + fn should_switch_operand_order( + &self, + op_spec: &BinaryOperatorSpecEntry, + node2cost: &BTreeMap, + ) -> bool { + if let Some((cost_lhs, cost_rhs)) = self.get_cost_estimates(op_spec, node2cost) + && cost_rhs.output < cost_lhs.output + { + // switch operands + return true; + } + + false + } + fn add_binary_operator_to_exec_plan<'a>( &'a self, op_spec_entry: &BinaryOperatorSpecEntry, @@ -699,14 +701,14 @@ impl Conjunction { ) -> Result<()> { let mut op: BinaryOperator<'a> = op_spec_entry .op - .create_operator(g, get_cost_estimates(op_spec_entry, &helper.node2cost))?; + .create_operator(g, self.get_cost_estimates(op_spec_entry, &helper.node2cost))?; let mut spec_idx_left = op_spec_entry.args.left; let mut spec_idx_right = op_spec_entry.args.right; let inverse_op = op.get_inverse_operator(g)?; if let Some(inverse_op) = inverse_op - && should_switch_operand_order(op_spec_entry, &helper.node2cost) + && self.should_switch_operand_order(op_spec_entry, &helper.node2cost) { spec_idx_left = op_spec_entry.args.right; spec_idx_right = op_spec_entry.args.left; @@ -714,7 +716,7 @@ impl Conjunction { op = inverse_op; } - // substract the offset from the specificated numbers to get the internal node number for this conjunction + // subtract the offset from the specified numbers to get the internal node number for this conjunction spec_idx_left -= self.var_idx_offset; spec_idx_right -= self.var_idx_offset; @@ -850,7 +852,7 @@ impl Conjunction { )?; } - // apply the the node error check + // apply the node error check if !output.node_search_errors.is_empty() { return Err(output.node_search_errors.remove(0)); } @@ -910,7 +912,7 @@ impl Conjunction { && first != *cid { // add location and description which node is not connected - let n_var = &self.nodes[*node_nr].var; + let n_var = &self.nodes[*node_nr - self.var_idx_offset].var; let location = self.location_in_query.get(n_var); return Err(GraphAnnisError::AQLSemanticError(AQLError { diff --git a/graphannis/src/annis/db/aql/conjunction/tests.rs b/graphannis/src/annis/db/aql/conjunction/tests.rs index 63a81b2ae..7e9fa0653 100644 --- a/graphannis/src/annis/db/aql/conjunction/tests.rs +++ b/graphannis/src/annis/db/aql/conjunction/tests.rs @@ -156,6 +156,25 @@ fn semantic_error_unbound() { } } +#[test] +fn semantic_error_unbound_in_non_first_alternative() { + let config = Config::default(); + let g = AnnotationGraph::with_default_graphstorages(false).unwrap(); + // Same as above, but the unbound variable is in a non-first alternative, + // whose nodes have a non-zero global offset. This asserts that the global + // node numbers are correctly translated to local indices. + let q = aql::parse("tok . tok | tok & tok", false).unwrap(); + let plan = ExecutionPlan::from_disjunction(&q, &g, &config, TimeoutCheck::new(None)); + match plan { + Err(AQLSemanticError(err)) => assert_eq!( + "Variable \"#4\" not bound (use linguistic operators)", + err.desc + ), + Err(err) => panic!("Query must return a semantic error, but returned {}", err), + Ok(_) => panic!("Query must return an error"), + } +} + #[test] fn semantic_error_optional_non_negated() { match aql::parse("tok? & tok? & #1 . #2", false) { From 5ec35fe2febbdfa8d03ea91a48cf3346cdf83f73 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 25 Jun 2026 14:30:44 +0200 Subject: [PATCH 2/3] Add comments that specify in which fields node numbers or variable names are used --- graphannis/src/annis/db/aql/conjunction.rs | 33 ++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/graphannis/src/annis/db/aql/conjunction.rs b/graphannis/src/annis/db/aql/conjunction.rs index 6ceda95a6..0259d55d5 100644 --- a/graphannis/src/annis/db/aql/conjunction.rs +++ b/graphannis/src/annis/db/aql/conjunction.rs @@ -30,7 +30,11 @@ use std::sync::Arc; #[derive(Debug)] pub struct BinaryOperatorArguments { + /// *Global* position int the query of the LHS. + /// This references not the position in the conjunction, but over all alternatives. pub left: usize, + /// *Global* position int the query of the RHS. + /// This references not the position in the conjunction, but over all alternatives. pub right: usize, pub global_reflexivity: bool, } @@ -64,19 +68,37 @@ pub struct NodeSearchSpecEntry { pub optional: bool, } +/// Represents a conjunction of node search descriptions that have a name (the +/// variable) and a position in the conjunction itself (the internal node +/// number). Conjunctions can be part of a [`Disjunction`], which have their own +/// different node numbers but can share the variable names. #[derive(Debug)] pub struct Conjunction { + /// The node search information for all the variables in the query. Indexed + /// by the position of the variable in the conjunction (first node at index + /// 0, second at 1 etc.). nodes: Vec, binary_operators: Vec, unary_operators: Vec, + /// Maps the variable names to the position of the variable in the conjunction. variables: HashMap, + /// Stores the location of a variable in the query string. location_in_query: HashMap, + /// Variable names in this set should be part of the output for `find`-queries. include_in_output: HashSet, + /// Queries are a disjunction of different alternative conjunctions. If + /// there is more than one alternative, each the variable ids (#1, #2, #3, + /// ...) of each conjunction have an offset. Since the number of variables + /// in each conjunction does not need to be the same, we can't derive the + /// offset from the number of alternatives, but have to remember the + /// variable ID offset for each conjunction. var_idx_offset: usize, } struct ExecutionPlanHelper { node2component: BTreeMap, + /// Cost estimates for different nodes of the query. The node is determined + /// by the position of the variable in the conjunction. node2cost: BTreeMap, } @@ -204,6 +226,8 @@ fn create_join<'b>( } impl Conjunction { + /// Create new conjunction without a configured offset for its variable IDs. + /// e.g. if there is no parent disjunction or only one alternative. pub fn new() -> Conjunction { Conjunction { nodes: vec![], @@ -216,6 +240,7 @@ impl Conjunction { } } + /// Create new conjunction with a configured offset for its variable IDs. pub fn with_offset(var_idx_offset: usize) -> Conjunction { Conjunction { nodes: vec![], @@ -344,6 +369,8 @@ impl Conjunction { self.nodes.len() } + /// Retrieve the global position (overall disjunctions) for a given variable + /// name. pub fn resolve_variable_pos( &self, variable: &str, @@ -368,7 +395,7 @@ impl Conjunction { /// Return the variable name for a node number. The node number is the /// position of a node description in the query. In case of a query with /// multiple alternatives, this refers to the node number of the whole query and - /// not only the conjunction. + /// not only the one inside the conjunction. pub fn get_variable_by_node_nr(&self, node_nr: usize) -> Option { let pos = node_nr.checked_sub(self.var_idx_offset)?; self.nodes.get(pos).map(|spec| spec.var.clone()) @@ -866,6 +893,8 @@ impl Conjunction { } fn check_components_connected(&self) -> Result<()> { + // Maps the global node number (over all alternatives of the + // disjunction) to the component number. let mut node2component: BTreeMap = BTreeMap::new(); node2component.extend( self.nodes @@ -873,7 +902,7 @@ impl Conjunction { .enumerate() // Exclude all optional nodes from the component calculation .filter(|(_i, n)| !n.optional) - // Use the global node number when there are several conjunctions + // Use the global node number in case there are several conjunctions. .map(|(i, _n)| self.var_idx_offset + i) // Set the node position as initial unique component number .map(|i| (i, i)), From 4a1279064011fbedf03d88010f03742040e482e3 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 25 Jun 2026 15:06:25 +0200 Subject: [PATCH 3/3] Wording --- graphannis/src/annis/db/aql/conjunction.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/graphannis/src/annis/db/aql/conjunction.rs b/graphannis/src/annis/db/aql/conjunction.rs index 0259d55d5..dd3d0ac73 100644 --- a/graphannis/src/annis/db/aql/conjunction.rs +++ b/graphannis/src/annis/db/aql/conjunction.rs @@ -31,10 +31,12 @@ use std::sync::Arc; #[derive(Debug)] pub struct BinaryOperatorArguments { /// *Global* position int the query of the LHS. - /// This references not the position in the conjunction, but over all alternatives. + /// This references not the position in the conjunction, but the position + /// over all alternatives. pub left: usize, /// *Global* position int the query of the RHS. - /// This references not the position in the conjunction, but over all alternatives. + /// This references not the position in the conjunction, but the position + /// over all alternatives. pub right: usize, pub global_reflexivity: bool, }