# browser/http_client/html_parser.py

# Tags that the parser attaches to the current parent immediately instead of
# keeping them open to collect children.
SELF_CLOSING_TAGS = [
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
]

# Tags that make the parser open an implicit <head> (rather than <body>) and
# that do not force an open <head> to be closed.
# Kept as a list: it is concatenated with another list when checked.
HEAD_TAGS = [
    "base",
    "basefont",
    "bgsound",
    "noscript",
    "link",
    "meta",
    "title",
    "style",
    "script",
]

class Element:
    """A tag node in the parsed tree.

    Attributes:
        tag: The tag name as given by the caller.
        attributes: Mapping of attribute name to attribute value.
        children: Child nodes, in document order; starts empty.
        parent: The enclosing node, or None for the root.
    """

    def __init__(self, tag, attributes, parent):
        self.tag = tag
        self.attributes = attributes
        self.children = []
        self.parent = parent

    def __repr__(self):
        """Return the node rendered as an opening tag, e.g. ``<a href="x">``."""
        # f-strings format any value, so a non-string attribute (possible when
        # the tree was rebuilt from JSON) no longer makes repr() raise.
        attr_str = "".join(
            f' {key}="{value}"' for key, value in self.attributes.items()
        )
        return f"<{self.tag}{attr_str}>"

class Text:
    """A node holding a run of character data.

    Attributes:
        text: The raw text content.
        children: Starts empty; kept so code that walks the tree can iterate
            ``children`` on any node.
        parent: The enclosing node.
    """

    def __init__(self, text, parent):
        self.parent = parent
        self.children = []
        self.text = text

    def __repr__(self):
        """Return the repr of the stored text."""
        return f"{self.text!r}"

class HTML:
    """Lenient HTML parser that builds a tree of Element and Text nodes.

    The document is parsed on construction and the resulting root node is
    stored in ``root``. ``parse()`` may also be called directly; it re-parses
    ``raw_html`` and returns a fresh tree.

    Attributes:
        raw_html: The source markup.
        unfinished: Stack of elements that are currently open (root first).
        root: Root node produced by the parse run in ``__init__``.
    """

    def __init__(self, raw_html):
        self.raw_html = raw_html
        self.unfinished = []

        # finish() pops the root off `unfinished`, so the tree must be kept
        # here or it would be lost as soon as construction ends.
        self.root = self.parse()

    def parse(self):
        """Parse ``raw_html`` and return the root node of the tree."""
        self.unfinished = []
        buffer = []
        in_tag = False
        for c in self.raw_html:
            if c == "<":
                in_tag = True
                # Everything collected before a tag opens is text content.
                if buffer: self.add_text("".join(buffer))
                buffer = []
            elif c == ">" and in_tag:
                in_tag = False
                # Everything collected since "<" is the tag's own text.
                self.add_tag("".join(buffer))
                buffer = []
            else:
                # Includes a ">" seen outside a tag, which is literal text.
                buffer.append(c)

        # A trailing, never-closed "<..." is dropped; trailing text is kept.
        if not in_tag and buffer:
            self.add_text("".join(buffer))

        return self.finish()

    def add_text(self, text):
        """Append a Text node to the innermost open element.

        Whitespace-only text is ignored.
        """
        if text.isspace(): return
        self.implicit_tags(None)
        parent = self.unfinished[-1]
        node = Text(text, parent)
        parent.children.append(node)

    @staticmethod
    def _split_tag_text(text):
        """Split tag text on whitespace, keeping quoted attribute values whole.

        A quote only starts a quoted run when it directly follows ``=``.
        """
        tokens = []
        current = []
        quote = None
        for ch in text:
            if quote is not None:
                current.append(ch)
                if ch == quote:
                    quote = None
            elif ch.isspace():
                if current:
                    tokens.append("".join(current))
                    current = []
            else:
                current.append(ch)
                if ch in "'\"" and len(current) >= 2 and current[-2] == "=":
                    quote = ch
        if current:
            tokens.append("".join(current))
        return tokens

    @staticmethod
    def _drop_self_closing_slash(parts):
        """Remove an XHTML-style trailing ``/`` from the last token, in place.

        A slash that ends an unquoted attribute value (``href=/docs/``) is part
        of the value and is left alone, as is a tag text that is only ``/``.
        """
        last = parts[-1]
        if parts == ["/"] or not last.endswith("/"):
            return
        if "=" in last and len(parts) > 1:
            value = last.split("=", 1)[1]
            quoted = len(value) >= 3 and value[0] in "'\"" and value[-2] == value[0]
            if not quoted:
                return
        trimmed = last[:-1]
        if trimmed:
            parts[-1] = trimmed
        else:
            parts.pop()

    def get_attributes(self, text):
        """Split the text between ``<`` and ``>`` into a tag name and attributes.

        Returns:
            A ``(tag, attributes)`` pair. ``tag`` is case-folded and is ``""``
            when ``text`` is empty or whitespace. ``attributes`` maps
            case-folded names to values; matching surrounding quotes are
            removed and attributes without ``=`` map to ``""``.
        """
        parts = self._split_tag_text(text)
        if not parts:
            return "", {}
        self._drop_self_closing_slash(parts)

        tag = parts[0].casefold()
        attributes = {}

        for attrpair in parts[1:]:
            key, has_value, value = attrpair.partition("=")
            if has_value:
                # Only strip quotes that actually match at both ends, so an
                # unterminated quote never eats a real character.
                if len(value) >= 2 and value[0] in "'\"" and value[-1] == value[0]:
                    value = value[1:-1]
                attributes[key.casefold()] = value
            else:
                attributes[attrpair.casefold()] = ""

        return tag, attributes

    def add_tag(self, tag):
        """Update the open-element stack for one tag's text.

        Empty tags and tags starting with ``!`` are ignored. A tag starting
        with ``/`` closes the innermost open element (the root is never
        closed here). Tags in SELF_CLOSING_TAGS are attached immediately;
        any other tag is pushed as a new open element.
        """
        tag, attributes = self.get_attributes(tag)

        if not tag or tag.startswith("!"): return

        self.implicit_tags(tag)

        if tag.startswith("/"):
            if len(self.unfinished) == 1: return
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        elif tag in SELF_CLOSING_TAGS:
            parent = self.unfinished[-1]
            node = Element(tag, attributes, parent)
            parent.children.append(node)
        else:
            parent = self.unfinished[-1] if self.unfinished else None
            node = Element(tag, attributes, parent)
            self.unfinished.append(node)

    def implicit_tags(self, tag):
        """Insert omitted ``<html>``, ``<head>``, ``<body>`` and ``</head>`` tags.

        Args:
            tag: The tag about to be added, or None when text (or the end of
                the document) is being handled.
        """
        while True:
            open_tags = [node.tag for node in self.unfinished]
            if open_tags == [] and tag != "html":
                self.add_tag("html")
            elif open_tags == ["html"] and tag not in ["head", "body", "/html"]:
                if tag in HEAD_TAGS:
                    self.add_tag("head")
                else:
                    self.add_tag("body")
            elif open_tags == ["html", "head"] and tag not in ["/head"] + HEAD_TAGS:
                self.add_tag("/head")
            else:
                break

    def finish(self):
        """Close every element still open and return the root node."""
        if not self.unfinished:
            self.implicit_tags(None)

        while len(self.unfinished) > 1:
            node = self.unfinished.pop()
            parent = self.unfinished[-1]
            parent.children.append(node)
        return self.unfinished.pop()

    @staticmethod
    def print_tree(node, indent=0):
        """Print ``node`` and its descendants, indenting two spaces per level."""
        print(" " * indent, node)
        for child in node.children:
            HTML.print_tree(child, indent + 2)

    @staticmethod
    def to_json(tree: "Element | Text"):
        """Convert a tree into nested lists.

        Text becomes ``["text", text, children]`` and Element becomes
        ``["element", tag, attributes, children]``. Any other input yields
        None. The attributes dict is shared with the node, not copied.
        """
        if isinstance(tree, Text):
            return ["text", tree.text, [HTML.to_json(child) for child in tree.children]]
        elif isinstance(tree, Element):
            return ["element", tree.tag, tree.attributes, [HTML.to_json(child) for child in tree.children]]
        return None

    @staticmethod
    def from_json(json_list, parent=None):
        """Rebuild a tree from the nested-list form produced by ``to_json``.

        Returns None when ``json_list[0]`` is neither ``"text"`` nor
        ``"element"``.
        """
        if json_list[0] == "text":
            text = Text(json_list[1], parent)
            text.children = [HTML.from_json(child, text) for child in json_list[2]]
            return text
        elif json_list[0] == "element":
            element = Element(json_list[1], json_list[2], parent)
            element.children = [HTML.from_json(child, element) for child in json_list[3]]
            return element
        return None