mirror of
https://github.com/csd4ni3l/browser.git
synced 2026-01-01 04:03:43 +01:00
Initial version; I forgot to commit for a long time.
This commit is contained in:
153
http_client/html_parser.py
Normal file
153
http_client/html_parser.py
Normal file
@@ -0,0 +1,153 @@
|
||||
# Tags that never get a closing tag: the parser attaches them to their parent
# immediately instead of keeping them open.
SELF_CLOSING_TAGS = [
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
]

# Tags the parser places inside an implicit <head>; any other tag seen while
# only <html> is open causes an implicit <body> to be opened instead.
# Kept as a list because it is concatenated with another list by the parser.
HEAD_TAGS = [
    "base",
    "basefont",
    "bgsound",
    "noscript",
    "link",
    "meta",
    "title",
    "style",
    "script",
]
|
||||
|
||||
class Element:
    """A tag node in the parsed tree.

    Attributes:
        tag: The tag name as given by the caller.
        attributes: Mapping of attribute name to attribute value.
        children: Child nodes, initially empty; filled in by the parser.
        parent: The enclosing node, or None for the root.
    """

    def __init__(self, tag, attributes, parent):
        self.tag = tag
        self.attributes = attributes
        self.children = []
        self.parent = parent

    def __repr__(self):
        """Return the opening-tag form of this element, e.g. ``<a href="x">``."""
        # Each attribute is rendered as a leading space plus name="value".
        rendered = "".join(
            "".join((" ", name, "=\"", value, "\""))
            for name, value in self.attributes.items()
        )
        return "".join(("<", self.tag, rendered, ">"))
|
||||
|
||||
class Text:
    """A text node in the parsed tree.

    Attributes:
        text: The character data of this node.
        children: Always created empty; present so text and element nodes
            can be walked the same way.
        parent: The enclosing node.
    """

    def __init__(self, text, parent):
        self.text = text
        self.children = []
        self.parent = parent

    def __repr__(self):
        """Return the ``repr`` of the wrapped text."""
        return f"{self.text!r}"
|
||||
|
||||
class HTML():
    """Small, forgiving HTML parser that builds a tree of Element/Text nodes.

    The parser keeps a stack of currently open elements (``unfinished``).
    Opening tags push onto the stack, closing tags pop the innermost open
    element and attach it to its parent, and text / void elements are attached
    directly to the innermost open element. Missing ``<html>``, ``<head>`` and
    ``<body>`` tags are inserted implicitly.

    Known limitations (unchanged from the original implementation):
      * a closing tag pops the innermost open element regardless of its name;
      * a ``>`` inside a comment or a quoted attribute value ends the tag;
      * character references (``&amp;`` ...) are not decoded.

    Attributes:
        raw_html: The markup passed to the constructor.
        unfinished: Stack of open Element nodes; empty once parsing finished.
        root: The tree produced by the parse run done in the constructor.
    """

    def __init__(self, raw_html):
        """Store ``raw_html`` and parse it once, keeping the tree in ``root``."""
        self.raw_html = raw_html
        self.unfinished = []

        # The constructor has always parsed eagerly; keep the result instead
        # of throwing it away so callers do not have to parse a second time.
        self.root = self.parse()

    def parse(self):
        """Parse ``raw_html`` and return the root Element of a fresh tree."""
        chunk = []  # characters collected since the last "<" or ">"
        in_tag = False
        for c in self.raw_html:
            if c == "<":
                in_tag = True
                # Everything collected before a "<" is text content.
                if chunk:
                    self.add_text("".join(chunk))
                chunk = []
            elif c == ">" and in_tag:
                in_tag = False
                # Everything between "<" and ">" is the tag's content.
                self.add_tag("".join(chunk))
                chunk = []
            else:
                # Includes a stray ">" outside of any tag ("1 > 0"), which is
                # ordinary text and must not be interpreted as a tag end.
                chunk.append(c)

        # Trailing text after the last tag; an unterminated tag is dropped.
        if not in_tag and chunk:
            self.add_text("".join(chunk))

        return self.finish()

    def add_text(self, text):
        """Attach ``text`` as a Text node to the innermost open element.

        Empty and whitespace-only text is ignored.
        """
        if not text or text.isspace():
            return
        self.implicit_tags(None)
        parent = self.unfinished[-1]
        node = Text(text, parent)
        parent.children.append(node)

    def get_attributes(self, text):
        """Split the inside of a tag into ``(tag_name, attributes)``.

        Args:
            text: The characters between ``<`` and ``>``.

        Returns:
            A tuple of the case-folded tag name and a dict mapping case-folded
            attribute names to their values. Attributes without a value map
            to ``""``. For empty or whitespace-only ``text`` the result is
            ``("", {})``.
        """
        parts = text.split(None, 1)
        if not parts:
            # "<>" or "< >": there is no tag name to report.
            return "", {}

        tag = parts[0].casefold()
        # XML-style self-closing syntax glued to the name ("<br/>"). Closing
        # tags ("/p") keep their leading slash untouched.
        if len(tag) > 1 and not tag.startswith("/") and tag.endswith("/"):
            tag = tag.rstrip("/")

        rest = parts[1] if len(parts) == 2 else ""
        attributes = {}
        i, n = 0, len(rest)
        while i < n:
            if rest[i].isspace():
                i += 1
                continue

            # Attribute name: runs up to whitespace or "=".
            start = i
            while i < n and not rest[i].isspace() and rest[i] != "=":
                i += 1
            key = rest[start:i]

            # Allow whitespace around "=" ("href = x").
            j = i
            while j < n and rest[j].isspace():
                j += 1

            if j < n and rest[j] == "=":
                i = j + 1
                while i < n and rest[i].isspace():
                    i += 1
                if i < n and rest[i] in "'\"":
                    # Quoted value: may contain whitespace and may be empty.
                    quote = rest[i]
                    end = rest.find(quote, i + 1)
                    if end == -1:
                        # Unterminated quote: take the remainder as the value.
                        value = rest[i + 1:]
                        i = n
                    else:
                        value = rest[i + 1:end]
                        i = end + 1
                else:
                    # Unquoted value: runs up to the next whitespace.
                    start = i
                    while i < n and not rest[i].isspace():
                        i += 1
                    value = rest[start:i]
            else:
                value = ""
                # A bare trailing "/" ("<br />", "<input disabled/>") is the
                # self-closing marker, not part of an attribute name.
                key = key.rstrip("/")
                if not key:
                    continue

            attributes[key.casefold()] = value

        return tag, attributes

    def add_tag(self, tag):
        """Process one tag (the text between ``<`` and ``>``).

        Empty tags and tags whose name starts with ``!`` (doctype
        declarations, comments) are ignored. Closing tags finish the innermost
        open element, tags listed in SELF_CLOSING_TAGS are attached to the
        innermost open element right away, and every other tag is pushed onto
        the stack of open elements.
        """
        tag, attributes = self.get_attributes(tag)

        if not tag or tag.startswith("!"):
            return

        self.implicit_tags(tag)

        if tag.startswith("/"):
            # Never pop the root here; finish() returns it.
            if len(self.unfinished) <= 1:
                return
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        elif tag in SELF_CLOSING_TAGS:
            parent = self.unfinished[-1]
            node = Element(tag, attributes, parent)
            parent.children.append(node)
        else:
            parent = self.unfinished[-1] if self.unfinished else None
            node = Element(tag, attributes, parent)
            self.unfinished.append(node)

    def implicit_tags(self, tag):
        """Open or close ``<html>``, ``<head>`` and ``<body>`` as needed.

        Args:
            tag: The tag name about to be processed, or None when text is
                about to be added.
        """
        while True:
            open_tags = [node.tag for node in self.unfinished]
            if not open_tags and tag != "html":
                self.add_tag("html")
            elif open_tags == ["html"] and tag not in ("head", "body", "/html"):
                if tag in HEAD_TAGS:
                    self.add_tag("head")
                else:
                    self.add_tag("body")
            elif open_tags == ["html", "head"] and tag not in ["/head"] + HEAD_TAGS:
                # Anything that does not belong in <head> ends it implicitly.
                self.add_tag("/head")
            else:
                break

    def finish(self):
        """Close every element that is still open and return the root node."""
        if not self.unfinished:
            # Empty input still yields an <html><body> skeleton.
            self.implicit_tags(None)

        while len(self.unfinished) > 1:
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        return self.unfinished.pop()

    @staticmethod
    def print_tree(node, indent=0):
        """Print ``node`` and its descendants, indenting two columns per level."""
        print(" " * indent, node)
        for child in node.children:
            HTML.print_tree(child, indent + 2)

    @staticmethod
    def to_json(tree: "Element | Text"):
        """Convert a tree into nested lists.

        Text nodes become ``["text", text, children]`` and elements become
        ``["element", tag, attributes, children]``. Any other object yields
        None.
        """
        if isinstance(tree, Text):
            return ["text", tree.text, [HTML.to_json(child) for child in tree.children]]
        elif isinstance(tree, Element):
            return ["element", tree.tag, tree.attributes, [HTML.to_json(child) for child in tree.children]]

    @staticmethod
    def from_json(json_list, parent=None):
        """Rebuild a tree from the nested-list form produced by ``to_json``.

        Args:
            json_list: A ``["text", ...]`` or ``["element", ...]`` list.
            parent: Node to record as the parent of the rebuilt node.

        Returns:
            The rebuilt Text or Element, or None when the first item is
            neither ``"text"`` nor ``"element"``.
        """
        if json_list[0] == "text":
            text = Text(json_list[1], parent)
            text.children = [HTML.from_json(child, text) for child in json_list[2]]
            return text
        elif json_list[0] == "element":
            element = Element(json_list[1], json_list[2], parent)
            element.children = [HTML.from_json(child, element) for child in json_list[3]]
            return element
|
||||
Reference in New Issue
Block a user