Open
Description
We have some patches for incremental lookup. Here's what the new Trie
functions look like:
// Copyright 2025 Google LLC.
// SPDX-License-Identifier: Apache-2.0
// Incrementally looks up the trie from the specified query position and node
// ID stored in the given agent. If failed to find a registered key, the agent
// will keep the last query position and node ID from which the prefix of one
// or more registered keys can be matched.
//
// Example:
// The trie is built from 2 keys, "batch", "big".
// // "a" is neither a registered key nor a matched prefix.
// agent.set_query("a");
// ASSERT(trie.incremental_lookup(agent) == MARISA_NO_MATCHING_KEY);
// ASSERT(agent.state().query_pos() == 0);
//
// // "b" is a matched prefix. The outcome query position will be 1 while
// // the node ID will be the position of "b" matched on the trie.
// agent.set_query("b");
// ASSERT(trie.incremental_lookup(agent) ==
// MARISA_PREFIX_MATCHING_KEY_FOUND);
// ASSERT(agent.state().query_pos() == 1);
//
// // "bi" is also a matched prefix. But "ig" might be compacted together
// // in the underlying trie. If so, the outcome query position and node ID
// // are still the values that match "b".
// agent.set_query("bi");
// ASSERT(trie.incremental_lookup(agent) ==
// MARISA_PREFIX_MATCHING_KEY_FOUND);
// ASSERT(agent.state().query_pos() == 1);
//
// // "big" is a registered key. The outcome query position and node ID are
// // the values that match the whole query.
// agent.set_query("big");
// ASSERT(trie.incremental_lookup(agent) == MARISA_MATCHING_KEY_FOUND);
// ASSERT(agent.state().query_pos() == 3);
//
marisa_lookup_result_code incremental_lookup(Agent &agent) const;
// Similar to above one, this function can also support wildcard.
//
// Caller should call this function multiple times until it returns
// MARISA_NO_MATCHING_KEY_FOUND to enumerate all search results of a wildcard
// lookup query.
//
// Note that we can only handle at most one wildcard character per lookup, and
// the wildcard must be at the end of query. If the original query string
// contains multiple wildcards, caller must split it into several queries
// and call this function multiple times. It's also the caller's
// responsibility to maintain all search states of a lookup, and use them for
// following lookup.
//
// Input:
// agent.state.wildcard: the wildcard character. The valid range is
// [0, 255]. Wildcard lookup will be ignored if its value is invalid.
// agent.query: the query string.
// agent.state: the last matched state.
//
// Output:
// The lookup result of query.
//
// Example:
// // Trie: {'batch', 'big'}
// // Query: {'b*g'}
// agent.set_query("b*");
// StateData state_data;
// state_data.Reset();
// state_data.wildcard = '*';
// agent.restore_state(state_data);
//
// // "b*" can match two search states.
// ASSERT(trie.incremental_lookup_with_wildcard(agent) ==
// MARISA_PREFIX_MATCHING_KEY_FOUND);
// states.push_back();
// agent.store_state(&states.back());
// ASSERT(trie.incremental_lookup_with_wildcard(agent) ==
// MARISA_PREFIX_MATCHING_KEY_FOUND);
// states.push_back();
// agent.store_state(&states.back());
// ASSERT(trie.incremental_lookup_with_wildcard(agent) ==
// MARISA_NO_MATCHING_KEY_FOUND);
//
// agent.set_query("b*g");
// for(const auto& state: states) {
// agent.restore_state(state);
// // Only one lookup will return MARISA_MATCHING_KEY_FOUND, the other
// // will return MARISA_NO_MATCHING_KEY_FOUND.
// result = trie.incremental_lookup_with_wildcard(agent);
// // handle result ...
// }
marisa_lookup_result_code incremental_lookup_with_wildcard(
Agent &agent) const;
// The following three functions can be used to traverse the trie by node id.
// Gets a child node of given node id, this function can be called repeatly to
// get all children.
//
// Input:
// agent.query.id: The node id. The root node id for marisa trie is 0.
//
// Output:
// Returning true means a valid child node is found. In that case, the
// node_id field of agent state contains child node id and the key field
// contains the value of corresponding incoming edge.
// Returns false if no more child node is found.
//
// Example:
// // Trie: {'batch', 'big'}
// agent.set_query(0);
// ASSERT(trie.traverse_by_node_id(agent));
// ASSERT(agent.state().node_id() == 1); // Node 'b'
// ASSERT(strncmp(agent.key().ptr(), "b", 1) == 0);
// ASSERT(!trie.traverse_by_node_id(agent));
//
// agent.set_query(1);
// ASSERT(trie.traverse_by_node_id(agent));
// ASSERT(agent.state().node_id() == 2); // Node 'atch'
// ASSERT(strncmp(agent.key().ptr(), "atch", 4) == 0);
// ASSERT(trie.traverse_by_node_id(agent));
// ASSERT(agent.state().node_id() == 3); // Node 'ig'
// ASSERT(strncmp(agent.key().ptr(), "ig", 2) == 0);
// ASSERT(!trie.traverse_by_node_id(agent));
bool traverse_by_node_id(Agent &agent) const;
// Given a node id, retrieves the value of corresponding incoming edge or
// full path.
//
// Input:
// agent.query.id: The node id. The root node id for marisa trie is 0.
// agent.query.reverse_lookup_type: Indicates the type of result value.
// Default value is is Key.
//
// Output:
// Returns true if succeed. Lookup result is stored in agent.key.
// Returns false if failed.
//
// Example:
// // Trie: {'batch', 'big'}
// agent.set_query(2);
// agent.set_reverse_lookup_type(REVERSE_LOOKUP_KEY);
// ASSERT(trie.reverse_lookup_by_node_id(agent));
// ASSERT(strncmp(agent.key().ptr(), "batch", 5) == 0);
//
// agent.set_reverse_lookup_type(REVERSE_LOOKUP_EDGE);
// ASSERT(trie.reverse_lookup_by_node_id(agent));
// ASSERT(strncmp(agent.key().ptr(), "atch", 4) == 0);
bool reverse_lookup_by_node_id(Agent &agent) const;
// Given a node id, convert it to key id if it's a leaf node.
//
// Input:
// agent.query.id: The node id.
//
// Output:
// Returns true if succeed, the key id is stored in agent.key.id.
// Returns false if failed.
//
// Example:
// // Trie: {'batch', 'big'}
// agent.set_query(2);
// ASSERT(trie.lookup_key_id_by_node_id(agent));
// ASSERT(agent.key().id() == 0);
bool lookup_key_id_by_node_id(Agent &agent) const;
I can send PRs for this if it looks interesting.