I wrote a little parser using pyparsing
to parse Google-like search strings, like foo AND (bar OR baz)
(full code below). Like Google, I would like to make the parser fully fault-tolerant. It should ignore errors and parse as much as it can.
I wonder if I should adapt my grammar somehow (which looks very hard to me) or add some preprocessing to make the search string always valid when it gets parsed (but there are quite some corner cases; see invalid expressions in tests below).
I also thought about using pyparsing’s search_string
instead of parse_string
which never seems to raise an exception, but the output is often not really useful for my use case (e.g. foo AND OR bar
=> [[TermNode(WORD, foo)], [BinaryNode(OR, TermNode(WORD, ND), TermNode(WORD, bar))]]
)
import pyparsing as pp
from typing import Literal
class TermNode:
def __init__(self, term_type: Literal["WORD", "PHRASE"], value: "Node"):
self.term_type = term_type
self.value = value
def __repr__(self):
return f"TermNode({self.term_type}, {self.value})"
class UnaryNode:
def __init__(self, operator: Literal["NOT"], operand: "Node"):
self.operator = operator
self.operand = operand
def __repr__(self):
return f"UnaryNode({self.operator}, {self.operand})"
class BinaryNode:
def __init__(self, operator: Literal["AND", "OR"], left: "Node", right: "Node"):
self.operator = operator
self.left = left
self.right = right
def __repr__(self):
return f"BinaryNode({self.operator}, {self.left}, {self.right})"
Node = TermNode | UnaryNode | BinaryNode
not_ = pp.Keyword("NOT")
and_ = pp.Keyword("AND")
or_ = pp.Keyword("OR")
lparen = pp.Literal("(")
rparen = pp.Literal(")")
extra_chars = "_-'"
word = ~(not_ | and_ | or_) + pp.Word(pp.alphanums + pp.alphas8bit + extra_chars).set_parse_action(lambda t: TermNode("WORD", t[0]))
phrase = pp.QuotedString(quoteChar='"').set_parse_action(lambda t: TermNode("PHRASE", t[0]))
term = (phrase | word)
or_expression = pp.Forward()
parens_expression = pp.Forward()
parens_expression <<= (pp.Suppress(lparen) + or_expression + pp.Suppress(rparen)) | term
not_expression = pp.Forward()
not_expression <<= (not_ + not_expression).set_parse_action(lambda t: UnaryNode("NOT", t[1])) | parens_expression
and_expression = pp.Forward()
and_expression <<= (not_expression + and_ + and_expression).set_parse_action(lambda t: BinaryNode("AND", t[0], t[2])) | (not_expression + and_expression).set_parse_action(lambda t: BinaryNode("AND", t[0], t[1])) | not_expression
or_expression <<= (and_expression + or_ + or_expression).set_parse_action(lambda t: BinaryNode("OR", t[0], t[2])) | and_expression
#or_expression.parse_string('', parse_all=True)
or_expression.run_tests("""
###
# Valid expressions
###
# Word term
foobar
# Umlaute in word term
Gürtel
# Phrase term
"foo bar"
# Special characters in phrase
"foo!~ bar %"
# Implicit AND
foo bar
# Explicit AND
foo AND bar
# Explicit OR
foo OR bar
# NOT
NOT foo
# Parenthesis
foo AND (bar OR baz)
# Complex expression 1
NOT foo AND ("bar baz" OR qux)
# Complex expression 2
foo AND (NOT "bar baz" (moo OR zoo) AND yoo)
# Complex expression 3
foo (bar NOT "baz moo") zoo
###
# Invalid expressions
###
# Unary before binary operator
foo NOT AND bar
# Invalid redundant operators
foo AND OR bar
# Unknown char outside quoted terms
foo ~ bar
# Binary operator at start of line
AND foo
# Binary operator at start of parens expression
(AND bar)
# Binary operator at end of line
foo AND
# Binary operator at end of parens expression
(foo AND)
# Unary operator at end of line
foo NOT
# Unary operator at end of parens expression
(foo NOT)
# Unbalanced parens
((foo)
# Unbalanced quotes
""foo"
""");