mirror of
https://github.com/aiogram/aiogram.git
synced 2026-04-08 16:37:47 +00:00
Add Bot API parser and code-generator
This commit is contained in:
parent
5e9d4e55d9
commit
af2573dbee
15 changed files with 3242 additions and 1 deletions
134
generator/parser.py
Normal file
134
generator/parser.py
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
import logging
|
||||
|
||||
import requests
|
||||
from lxml import html
|
||||
from lxml.html import HtmlElement
|
||||
|
||||
from generator.consts import DOCS_URL, ANCHOR_HEADER_PATTERN
|
||||
from generator.normalizers import (
|
||||
normalize_type_annotation,
|
||||
normalize_method_annotation,
|
||||
normalize_description,
|
||||
)
|
||||
from generator.structures import Group, Entity, Annotation
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Parser:
|
||||
def __init__(self):
|
||||
self.docs = self.load(DOCS_URL)
|
||||
self.groups = []
|
||||
|
||||
@staticmethod
|
||||
def load_page(url: str) -> str:
|
||||
log.info("Load page %r", url)
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
|
||||
@staticmethod
|
||||
def to_html(content: str, url: str) -> HtmlElement:
|
||||
page = html.fromstring(content, url)
|
||||
|
||||
for br in page.xpath("*//br"):
|
||||
br.tail = "\n" + br.tail if br.tail else "\n"
|
||||
|
||||
return page
|
||||
|
||||
def load(self, url: str) -> HtmlElement:
|
||||
content = self.load_page(url)
|
||||
return self.to_html(content, url)
|
||||
|
||||
def optimize_group(self, group: Group):
|
||||
if not group.childs:
|
||||
log.warning("Remove empty %s", group)
|
||||
self.groups.remove(group)
|
||||
return
|
||||
|
||||
if not group.childs[0].annotations:
|
||||
log.warning("Update group %r description from first child element", group.title)
|
||||
group.description = group.childs[0].description
|
||||
group.childs.pop(0)
|
||||
|
||||
def parse(self):
|
||||
self.groups.clear()
|
||||
|
||||
group = None
|
||||
|
||||
for item in self.docs.xpath("//a[@class='anchor']"): # type: HtmlElement
|
||||
parent_tag: HtmlElement = item.getparent()
|
||||
anchor_name = item.get("name", None)
|
||||
matches = ANCHOR_HEADER_PATTERN.match(parent_tag.tag)
|
||||
if not matches or not anchor_name:
|
||||
continue
|
||||
level = int(matches.group(1))
|
||||
title = item.tail
|
||||
|
||||
if level == 3:
|
||||
if group:
|
||||
self.optimize_group(group)
|
||||
|
||||
log.info("Parse group %r (#%s)", title, anchor_name)
|
||||
group = Group(title=title, anchor=anchor_name)
|
||||
self.groups.append(group)
|
||||
|
||||
if level == 4 and len(title.split()) > 1:
|
||||
continue
|
||||
|
||||
elif anchor_name not in ["recent-changes", "authorizing-your-bot", "making-requests"]:
|
||||
child = self._parse_child(parent_tag, anchor_name)
|
||||
group.childs.append(child)
|
||||
|
||||
return self.groups
|
||||
|
||||
def _parse_child(self, start_tag: HtmlElement, anchor: str):
|
||||
name = start_tag.text_content()
|
||||
description = []
|
||||
annotations = []
|
||||
|
||||
is_method = name[0].islower()
|
||||
|
||||
log.info("Parse block: %r (#%s)", name, anchor)
|
||||
|
||||
for item in self._parse_tags_group(start_tag):
|
||||
if item.tag == "table":
|
||||
for raw in self._parse_table(item):
|
||||
if is_method:
|
||||
normalize_method_annotation(raw)
|
||||
else:
|
||||
normalize_type_annotation(raw)
|
||||
annotations.append(Annotation(**raw))
|
||||
|
||||
elif item.tag == "p":
|
||||
description.extend(item.text_content().splitlines())
|
||||
elif item.tag == "blockquote":
|
||||
description.extend(self._parse_blockquote(item))
|
||||
elif item.tag == "ul":
|
||||
description.extend(self._parse_list(item))
|
||||
|
||||
description = normalize_description("\n".join(description))
|
||||
block = Entity(anchor=anchor, name=name, description=description, annotations=annotations)
|
||||
log.info("%s", block)
|
||||
return block
|
||||
|
||||
def _parse_tags_group(self, start_tag: HtmlElement):
|
||||
tag: HtmlElement = start_tag.getnext()
|
||||
while tag is not None and tag.tag not in ["h3", "h4"]:
|
||||
yield tag
|
||||
tag: HtmlElement = tag.getnext()
|
||||
|
||||
def _parse_table(self, table: HtmlElement):
|
||||
head, body = table.getchildren() # type: HtmlElement, HtmlElement
|
||||
header = [item.text_content() for item in head.getchildren()[0]]
|
||||
|
||||
for body_item in body:
|
||||
yield {k: v for k, v in zip(header, [item.text_content() for item in body_item])}
|
||||
|
||||
def _parse_blockquote(self, blockquote: HtmlElement):
|
||||
for item in blockquote.getchildren():
|
||||
yield from item.text_content().splitlines()
|
||||
|
||||
def _parse_list(self, data: HtmlElement):
|
||||
for item in data.getchildren():
|
||||
yield " - " + item.text_content()
|
||||
Loading…
Add table
Add a link
Reference in a new issue