# File: browser/http_client/html_parser.py
# Listing metadata: 153 lines, 4.8 KiB, Python

# Tags that never get pushed onto the open-element stack: HTML.add_tag
# attaches them directly to the current parent, so they cannot have children.
SELF_CLOSING_TAGS = [
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
]

# Tags that HTML.implicit_tags places inside <head>; any other tag seen
# directly under <html> opens <body> instead.  Must stay a list because
# implicit_tags concatenates it with another list.
HEAD_TAGS = [
    "base",
    "basefont",
    "bgsound",
    "noscript",
    "link",
    "meta",
    "title",
    "style",
    "script",
]
class Element:
    """A tag node of the document tree.

    Attributes:
        tag: The tag name.
        attributes: Mapping of attribute name to attribute value.
        children: Child nodes, in document order; starts out empty.
        parent: The enclosing node, or None for the root.
    """

    def __init__(self, tag, attributes, parent):
        self.tag = tag
        self.attributes = attributes
        self.children = []
        self.parent = parent

    def __repr__(self):
        """Render the node as its opening tag, e.g. ``<a href="x">``."""
        # Plain "+" concatenation on purpose: a non-string attribute value
        # raises TypeError instead of being silently stringified.
        rendered_attrs = "".join(
            " " + name + "=\"" + value + "\""
            for name, value in self.attributes.items()
        )
        return "<" + self.tag + rendered_attrs + ">"
class Text:
    """A run of character data in the document tree.

    Attributes:
        text: The text content.
        children: Child list; present so Text and Element nodes can be
            walked the same way (e.g. by HTML.print_tree).
        parent: The enclosing node.
    """

    def __init__(self, text, parent):
        self.text = text
        self.children = []
        self.parent = parent

    def __repr__(self):
        """Return the quoted, escaped form of the text."""
        return f"{self.text!r}"
class HTML():
    """Lenient HTML parser that builds a tree of Element and Text nodes.

    The parser walks the raw markup character by character, treating
    everything between "<" and ">" as a tag and everything else as text.
    Missing <html>, <head> and <body> tags are inserted automatically.

    Known limitations (unchanged from the original design):
      * a ">" inside a quoted attribute value still ends the tag, because
        ``parse`` splits on every ">";
      * a closing tag closes the innermost open element without checking
        that the names match.

    Attributes:
        raw_html: The markup passed to the constructor.
        unfinished: Stack of currently open elements, outermost first.
        root: Root node produced by the parse run in the constructor.
    """

    # Characters that may delimit an attribute value.  A tuple (not a str)
    # so that membership tests with an empty string are False.
    _QUOTES = ("'", '"')

    def __init__(self, raw_html):
        """Store *raw_html* and parse it immediately."""
        self.raw_html = raw_html
        self.unfinished = []
        # Keep the tree: previously the result was thrown away, which forced
        # callers to call parse() a second time just to obtain it.
        self.root = self.parse()

    def parse(self):
        """Parse ``self.raw_html`` and return the root node of a fresh tree."""
        # Start from a clean stack so a previous, aborted run cannot leak
        # half-open elements into this one.
        self.unfinished = []
        buffer = []
        in_tag = False
        for c in self.raw_html:
            if c == "<":
                in_tag = True
                # Everything collected since the last tag was text content.
                if buffer:
                    self.add_text("".join(buffer))
                buffer = []
            elif c == ">":
                in_tag = False
                # Everything collected since "<" is the tag's own text.
                self.add_tag("".join(buffer))
                buffer = []
            else:
                buffer.append(c)
        # Trailing text after the last tag; an unterminated tag is dropped.
        if not in_tag and buffer:
            self.add_text("".join(buffer))
        return self.finish()

    def add_text(self, text):
        """Attach *text* as a Text child of the innermost open element.

        Whitespace-only text is ignored.
        """
        if text.isspace():
            return
        self.implicit_tags(None)
        parent = self.unfinished[-1]
        node = Text(text, parent)
        parent.children.append(node)

    @staticmethod
    def _split_tag_text(text):
        """Split tag text on whitespace, keeping quoted sections in one piece."""
        parts = []
        current = []
        open_quote = None
        for c in text:
            if open_quote is not None:
                current.append(c)
                if c == open_quote:
                    open_quote = None
            elif c in HTML._QUOTES:
                open_quote = c
                current.append(c)
            elif c.isspace():
                if current:
                    parts.append("".join(current))
                    current = []
            else:
                current.append(c)
        if current:
            parts.append("".join(current))
        return parts

    def get_attributes(self, text):
        """Split the inside of a tag into its name and attributes.

        Args:
            text: The characters between "<" and ">".

        Returns:
            A ``(tag, attributes)`` tuple.  ``tag`` is case-folded and is the
            empty string when *text* contains no tag name (e.g. ``<>``).
            ``attributes`` maps case-folded names to values; attributes
            without a value map to ``""``.
        """
        parts = self._split_tag_text(text)

        # Drop the XML-style slash that ends tags such as <br/>, <br /> or
        # <img src="x"/>.  A slash that ends an *unquoted* value (href=/a/)
        # belongs to the value and is left alone.
        if parts:
            last = parts[-1]
            if last == "/":
                parts.pop()
            elif last.endswith("/") and (
                "=" not in last or last[-2:-1] in self._QUOTES
            ):
                parts[-1] = last[:-1]

        if not parts:
            return "", {}

        tag = parts[0].casefold()
        attributes = {}
        for attrpair in parts[1:]:
            if "=" in attrpair:
                key, value = attrpair.split("=", 1)
                # Strip only a matching pair of quotes; ">= 2" so that an
                # empty quoted value ("") becomes the empty string.
                if (
                    len(value) >= 2
                    and value[0] in self._QUOTES
                    and value[-1] == value[0]
                ):
                    value = value[1:-1]
                attributes[key.casefold()] = value
            else:
                attributes[attrpair.casefold()] = ""
        return tag, attributes

    def add_tag(self, tag):
        """Process one tag: open an element, close one, or add a childless one.

        Empty tags and tags starting with "!" (doctype, comments) are ignored.
        """
        tag, attributes = self.get_attributes(tag)
        if not tag or tag.startswith("!"):
            return
        self.implicit_tags(tag)
        if tag.startswith("/"):
            # Never pop the root; finish() returns it.
            if len(self.unfinished) == 1:
                return
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        elif tag in SELF_CLOSING_TAGS:
            parent = self.unfinished[-1]
            node = Element(tag, attributes, parent)
            parent.children.append(node)
        else:
            parent = self.unfinished[-1] if self.unfinished else None
            node = Element(tag, attributes, parent)
            self.unfinished.append(node)

    def implicit_tags(self, tag):
        """Insert any missing <html>, <head>, <body> or </head> before *tag*.

        Args:
            tag: Name of the tag about to be added, or None for text.
        """
        while True:
            open_tags = [node.tag for node in self.unfinished]
            if open_tags == [] and tag != "html":
                self.add_tag("html")
            elif open_tags == ["html"] and tag not in ["head", "body", "/html"]:
                if tag in HEAD_TAGS:
                    self.add_tag("head")
                else:
                    self.add_tag("body")
            elif open_tags == ["html", "head"] and tag not in ["/head"] + HEAD_TAGS:
                self.add_tag("/head")
            else:
                break

    def finish(self):
        """Close every still-open element and return the root node."""
        if not self.unfinished:
            # Nothing was parsed: still produce an <html> root with a <body>.
            self.implicit_tags(None)
        while len(self.unfinished) > 1:
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        return self.unfinished.pop()

    @staticmethod
    def print_tree(node, indent=0):
        """Print *node* and its descendants, indenting two spaces per level."""
        print(" " * indent, node)
        for child in node.children:
            HTML.print_tree(child, indent + 2)

    @staticmethod
    def to_json(tree: "Element | Text"):
        """Convert a tree into nested lists suitable for JSON serialisation.

        Text nodes become ``["text", text, children]`` and elements become
        ``["element", tag, attributes, children]``.  Any other object yields
        None.
        """
        # The annotation is quoted so it is not evaluated at definition time;
        # the "|" union of classes needs Python 3.10+.
        if isinstance(tree, Text):
            return ["text", tree.text, [HTML.to_json(child) for child in tree.children]]
        elif isinstance(tree, Element):
            return ["element", tree.tag, tree.attributes, [HTML.to_json(child) for child in tree.children]]

    @staticmethod
    def from_json(json_list, parent=None):
        """Rebuild a tree from the nested-list form produced by ``to_json``.

        Args:
            json_list: A ``["text", ...]`` or ``["element", ...]`` list.
            parent: Node to record as the parent of the rebuilt node.

        Returns:
            The rebuilt Text or Element, or None for an unknown kind.
        """
        if json_list[0] == "text":
            text = Text(json_list[1], parent)
            text.children = [HTML.from_json(child, text) for child in json_list[2]]
            return text
        elif json_list[0] == "element":
            element = Element(json_list[1], json_list[2], parent)
            element.children = [HTML.from_json(child, element) for child in json_list[3]]
            return element