I have the below grammar and it works fine when the xml elements in the are in the same line. However, if they are on multiple lines, it does not. I have tried the “sm” regex flag and it still does not work.
How do I get the contents of the <section>
?
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
grammar = Grammar(
r"""
text = (sections)*
sections = ws* '<sections>' ws* section_element+ ws* '</sections>' ws*
section_element = "<section>" section_content "</section>"
section_content = ws* value ws*
ws = ~"s*"
value = ~"[ a-z < > nrs].*"
"""
)
class TextVisitor(NodeVisitor):
def visit_text(self, node, visited_children):
return node.text
def visit_section_content(self, node, visited_children):
return node.text
def generic_visit(self, node, visited_children):
return visited_children or node
text = """
<sections>
<section>
<abc> aaa </abc> <def>bbb</def>
</section>
</sections>
"""
tree = grammar.parse(text)
tv = TextVisitor()
extracted_text = tv.visit(tree)
print(extracted_text)
Changing
text = """
<sections>
<section>
<abc> aaa </abc> <def>bbb</def>
</section>
</sections>
"""
to
text = """
<sections>
<section>
<abc> aaa </abc>
<def>bbb</def>
</section>
</sections>
"""
does not work.
1