mirror of
https://github.com/csd4ni3l/browser.git
synced 2025-11-05 04:57:57 +01:00
153 lines
4.8 KiB
Python
153 lines
4.8 KiB
Python
# Tags that never take a closing tag: the parser attaches them straight to
# the current parent instead of pushing them onto the open-element stack.
SELF_CLOSING_TAGS = [
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
]
|
|
|
|
# Tags that belong inside <head>; seeing one of these while only <html> is
# open makes the parser open an implicit <head> rather than a <body>.
# Kept as a list because it is concatenated with another list by the parser.
HEAD_TAGS = [
    "base",
    "basefont",
    "bgsound",
    "noscript",
    "link",
    "meta",
    "title",
    "style",
    "script",
]
|
|
|
|
class Element:
    """A tag node in the parsed tree.

    Holds the tag name, a mapping of attribute names to values, the list of
    child nodes (initially empty) and a reference to the parent node.
    """

    def __init__(self, tag, attributes, parent):
        self.parent = parent
        self.children = []
        self.attributes = attributes
        self.tag = tag

    def __repr__(self):
        """Return the node rendered as an opening tag, e.g. ``<a href="x">``."""
        rendered_attributes = "".join(
            " " + name + "=\"" + value + "\""
            for name, value in self.attributes.items()
        )
        return "<" + self.tag + rendered_attributes + ">"
|
|
|
|
class Text:
    """A text node in the parsed tree.

    Carries the raw text, a reference to the parent node, and an (initially
    empty) ``children`` list so it can be walked like any other node.
    """

    def __init__(self, text, parent):
        self.parent = parent
        self.children = []
        self.text = text

    def __repr__(self):
        """Return the ``repr`` of the contained text."""
        return f"{self.text!r}"
|
|
|
|
class HTML:
    """Small, forgiving HTML parser producing a tree of Element/Text nodes.

    The parser keeps a stack of currently open elements (``unfinished``) and
    inserts implicit ``<html>``, ``<head>`` and ``<body>`` elements where the
    input omits them.  Constructing an instance parses ``raw_html`` once and
    keeps the resulting root node on ``self.root``; ``parse()`` can still be
    called directly and returns a freshly built tree each time.

    Known limitations: a ``>`` inside a quoted attribute value still ends the
    tag, attributes written with whitespace around ``=`` are not joined, and
    the contents of ``<script>``/``<style>`` are scanned like ordinary markup.
    """

    def __init__(self, raw_html):
        self.raw_html = raw_html
        self.unfinished = []

        # Keep the tree instead of throwing it away, so callers do not have
        # to invoke parse() a second time just to obtain the result.
        self.root = self.parse()

    def parse(self):
        """Scan ``self.raw_html`` and return the root node of the built tree.

        Text between tags goes to ``add_text`` and the inside of each
        ``<...>`` to ``add_tag``.  A ``<`` only starts markup when followed by
        an ASCII letter, ``/``, ``!`` or ``?``; any other ``<`` and every
        ``>`` outside a tag is ordinary text.  ``<!-- ... -->`` comments are
        skipped as a unit, and an unterminated tag or comment at the end of
        the input is dropped.
        """
        html = self.raw_html
        end_of_input = len(html)
        text_start = 0  # start of the text run not yet handed to add_text
        cursor = 0      # where the next search for "<" begins

        while True:
            lt = html.find("<", cursor)
            if lt == -1:
                break

            following = html[lt + 1:lt + 2]
            starts_markup = following in ("/", "!", "?") or (
                following.isascii() and following.isalpha()
            )
            if not starts_markup:
                # A lone "<" (as in "a < b") is content, not a tag.
                cursor = lt + 1
                continue

            if lt > text_start:
                self.add_text(html[text_start:lt])

            if html.startswith("<!--", lt):
                # Searching from lt + 2 lets the degenerate "<!-->" close
                # itself; an unterminated comment swallows the remaining input.
                close = html.find("-->", lt + 2)
                text_start = cursor = end_of_input if close == -1 else close + 3
                continue

            gt = html.find(">", lt + 1)
            if gt == -1:
                # Unterminated tag at end of input: discard it.
                text_start = end_of_input
                break

            self.add_tag(html[lt + 1:gt])
            text_start = cursor = gt + 1

        if text_start < end_of_input:
            self.add_text(html[text_start:])

        return self.finish()

    def add_text(self, text):
        """Append ``text`` as a Text child of the innermost open element.

        Empty and whitespace-only runs are ignored.  Implicit elements are
        opened first so there is always a parent to attach to.
        """
        if not text or text.isspace():
            return
        self.implicit_tags(None)
        parent = self.unfinished[-1]
        parent.children.append(Text(text, parent))

    @staticmethod
    def _split_tag_text(text):
        """Split the inside of a tag on whitespace, keeping quoted values whole.

        A quote only opens a quoted run directly after ``=``, so a stray
        apostrophe elsewhere cannot swallow the rest of the tag.
        """
        tokens = []
        current = []
        quote = None
        for ch in text:
            if quote is not None:
                current.append(ch)
                if ch == quote:
                    quote = None
            elif ch in ("'", "\"") and current and current[-1] == "=":
                quote = ch
                current.append(ch)
            elif ch.isspace():
                if current:
                    tokens.append("".join(current))
                    current = []
            else:
                current.append(ch)
        if current:
            tokens.append("".join(current))
        return tokens

    @staticmethod
    def _unquote(value):
        """Strip the surrounding quotes from an attribute value, if any."""
        if value[:1] in ("'", "\""):
            if len(value) >= 2 and value.endswith(value[0]):
                return value[1:-1]
            # Opening quote without a matching close: drop just the quote.
            return value[1:]
        return value

    def get_attributes(self, text):
        """Split the inside of a tag into ``(tag_name, attributes)``.

        The tag name and attribute names are case-folded.  Attributes without
        ``=`` map to an empty string.  A trailing ``/`` from XML-style
        self-closing syntax (``<br/>``, ``<br />``, ``<img alt="x"/>``) is
        discarded.  Empty input yields ``("", {})``.
        """
        tokens = self._split_tag_text(text)

        if tokens:
            last = tokens[-1]
            if last == "/":
                tokens.pop()
            elif last.endswith(("\"/", "'/")) or (len(tokens) == 1 and last.endswith("/")):
                # Only strip the slash from the tag name or after a quoted
                # value; an unquoted value such as href=/a/ keeps its slash.
                tokens[-1] = last[:-1]

        if not tokens:
            return "", {}

        tag = tokens[0].casefold()
        attributes = {}

        for attrpair in tokens[1:]:
            if "=" in attrpair:
                key, value = attrpair.split("=", 1)
                attributes[key.casefold()] = self._unquote(value)
            else:
                attributes[attrpair.casefold()] = ""

        return tag, attributes

    def add_tag(self, tag):
        """Handle one tag: open an element, close one, or attach a void element.

        Empty tags, closing tags without a name, ``<!...>`` declarations and
        ``<?...>`` processing instructions are ignored.  A closing tag pops the
        innermost open element regardless of its name, unless only the root
        remains open.
        """
        tag, attributes = self.get_attributes(tag)

        if not tag or tag == "/" or tag.startswith(("!", "?")):
            return

        self.implicit_tags(tag)

        if tag.startswith("/"):
            if len(self.unfinished) == 1:
                return  # keep the root open until finish()
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        elif tag in SELF_CLOSING_TAGS:
            parent = self.unfinished[-1]
            node = Element(tag, attributes, parent)
            parent.children.append(node)
        else:
            parent = self.unfinished[-1] if self.unfinished else None
            node = Element(tag, attributes, parent)
            self.unfinished.append(node)

    def implicit_tags(self, tag):
        """Open or close ``<html>``, ``<head>``, ``<body>`` as the input implies.

        ``tag`` is the tag about to be processed, or ``None`` for text.  The
        loop repeats until the stack of open elements is consistent with it.
        """
        while True:
            open_tags = [node.tag for node in self.unfinished]
            if open_tags == [] and tag != "html":
                self.add_tag("html")
            elif open_tags == ["html"] and tag not in ["head", "body", "/html"]:
                if tag in HEAD_TAGS:
                    self.add_tag("head")
                else:
                    self.add_tag("body")
            elif open_tags == ["html", "head"] and tag not in ["/head"] + HEAD_TAGS:
                self.add_tag("/head")
            else:
                break

    def finish(self):
        """Close every element still open and return the root node.

        If nothing was opened at all, the implicit elements are created first
        so that a root always exists.  The stack is empty afterwards.
        """
        if not self.unfinished:
            self.implicit_tags(None)

        while len(self.unfinished) > 1:
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        return self.unfinished.pop()

    @staticmethod
    def print_tree(node, indent=0):
        """Print ``node`` and its descendants, indenting two spaces per level."""
        print(" " * indent, node)
        for child in node.children:
            HTML.print_tree(child, indent + 2)

    @staticmethod
    def to_json(tree: Element | Text):
        """Convert a node tree into nested lists.

        Text nodes become ``["text", text, children]`` and elements become
        ``["element", tag, attributes, children]``.  Any other object yields
        ``None``.
        """
        if isinstance(tree, Text):
            return ["text", tree.text, [HTML.to_json(child) for child in tree.children]]
        elif isinstance(tree, Element):
            return ["element", tree.tag, tree.attributes, [HTML.to_json(child) for child in tree.children]]
        return None

    @staticmethod
    def from_json(json_list, parent=None):
        """Rebuild a node tree from the nested-list form made by ``to_json``.

        ``parent`` becomes the parent of the node being rebuilt.  An
        unrecognised leading kind string yields ``None``.
        """
        if json_list[0] == "text":
            text = Text(json_list[1], parent)
            text.children = [HTML.from_json(child, text) for child in json_list[2]]
            return text
        elif json_list[0] == "element":
            element = Element(json_list[1], json_list[2], parent)
            element.children = [HTML.from_json(child, element) for child in json_list[3]]
            return element
        return None