input:
<!DOCTYPE html>
<html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
</html>
---
output:
[('h1', 'My First Heading'),
('p', 'My first paragraph.'),
('body', '\n\n<h1>My First Heading</h1>\n<p>My first paragraph.</p>\n\n'),
('html', '\n<body>\n\n<h1>My First Heading</h1>\n<p>My first paragraph.</p>\n\n</body>\n'),
Note: This is not meant to be a completely robust implementation. It obviously does not parse every aspect of html.
The Idea: The key is to identify the grammar of an html tag, and use a stack to facilitate the deconstruction of the string. An html tag has the following simple grammar: html ::= '<' tag '>' exp '</' tag '>' where tag and exp are arbitrary strings. Each token can be uniquely identified on the basis of what came before it within the stack.
Complexity: O(n) time and O(n) space (consider the html string: '<a>b</a>')
from enum import Enum
class Token(Enum):
OPEN1 = 1
OPEN2 = 2
CLOSE1 = 3
CLOSE2 = 4
def html_parser(html_str):
init_html = len("<!DOCTYPE html>")
assert(html_str[0:init_html] == "<!DOCTYPE html>")
html_str = html_str[init_html:]
tag_expr = []
s = []
for i, char in enumerate(html_str):
# this if statement is ultimately just a placeholder for the second statement
if char == '<' and i+1 < len(html_str) and html_str[i+1] != '/':
s.append((i, Token.OPEN1))
# this will be used to identify the start of the expression
elif char == '>' and len(s) != 0 and s[-1][1] == Token.OPEN1:
s.pop()
s.append((i, Token.CLOSE1))
# this identifies the end of the expression
elif char == '<' and i+1 < len(html_str) and s[-1][1] == Token.CLOSE1:
tmp_expr = html_str[s[-1][0] + 1:i]
s.pop()
s.append((i, Token.OPEN2))
# this identifies the tag name (see that the first tag name is ignored)
elif char == '>' and len(s) != 0 and s[-1][1] == Token.OPEN2:
tmp_tag = html_str[s[-1][0] + 2:i]
tag_expr.append((tmp_tag, tmp_expr))
s.pop()
return tag_expr
Testing
ex_doc1 = \
"""<!DOCTYPE html>
<html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
</html>"""
print(html_parser(ex_doc1))
ex_doc2 = \
"""<!DOCTYPE html>
<html>
<body>
<p>
This paragraph
contains a lot of lines
in the source code,
but the browser
ignores it.
</p>
<p>
This paragraph
contains a lot of spaces
in the source code,
but the browser
ignores it.
</p>
<p>
The number of lines in a paragraph depends on the size of the browser window. If you resize the browser window, the number of lines in this paragraph will change.
</p>
</body>
</html>"""
print(html_parser(ex_doc2))