improve parsing and add basic support for equations

Dav1dde · May 22, 2024 · d87dfc4 · d87dfc4
1 parent 3cd03ec
commit d87dfc4
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 18 deletions.
diff --git a/glad/documentation.py b/glad/documentation.py
@@ -1,5 +1,5 @@
 from glad.parse import SpecificationDocs, CommandDocs, xml_parse
-from glad.util import prefix, memoize, raw_text
+from glad.util import prefix, suffix, memoize, raw_text
 from shutil import rmtree
 import glad.util
 import subprocess
@@ -65,15 +65,15 @@ def docs_from_html_file(cls, path):
         # Command brief description appears in the first 'refnamediv' block
         brief_block = cls.xml_text(tree.find('.//div[@class="refnamediv"]/p'))
         brief = f'[{path.stem}](https://docs.gl/{path.parent.name}/{path.stem}) — ' \
-            f'{brief_block.split("—")[1]}'
+            f'{suffix(".", brief_block.split("—")[1])}'
 
         # Description parsing
         description = []
         description_blocks = next(
             (s for s in sections if raw_text(s.find('h2')) == 'Description'),
             None,
         )
-        if description_blocks:
+        if description_blocks is not None:
             blocks = description_blocks.findall('./*')
             description = list(
                 filter(
@@ -85,7 +85,7 @@ def docs_from_html_file(cls, path):
         # Notes parsing
         notes = []
         notes_blocks = next((s for s in sections if raw_text(s.find('h2')) == 'Notes'), None)
-        if notes_blocks:
+        if notes_blocks is not None:
             blocks = notes_blocks.findall('./*')
             notes = list(
                 filter(
@@ -102,7 +102,7 @@ def docs_from_html_file(cls, path):
         # file. This means that we have to find the correct block of parameters for each definition.
         funcdefs = [
             d for d in tree.findall('.//*[@class="funcsynopsis"]/*')
-            if d.find('.//*[@class="funcdef"]')
+            if d.find('.//*[@class="funcdef"]') is not None
         ]
         for func_def in funcdefs:
             func_name = func_def.find('.//*[@class="fsfunc"]').text
@@ -114,7 +114,7 @@ def docs_from_html_file(cls, path):
                 (s for s in sections if raw_text(s.find('h2')) == f'Parameters for {func_name}'),
                 None,
             )
-            if not params_block:
+            if params_block is None:
                 for p in list(s for s in sections if raw_text(s.find('h2')) == 'Parameters'):
                     block_params = [raw_text(n) for n in p.findall('.//dt//code')]
                     if all(p in block_params for p in func_params):
@@ -141,31 +141,68 @@ def docs_from_html_file(cls, path):
     @staticmethod
     def format(e, is_tail=False):
+        """Render the text of element ``e`` (or its tail when ``is_tail`` is set).
+
+        Serves as the ``format`` callback passed to ``glad.util.itertext``.
+        """
         if is_tail:
-            if e.tag == 'mfenced':
-                # closing mathjax fences
-                return f'{e.attrib["close"]}'
             if e.tag == 'dt':
                 # closing a definition term
                 return '\n'
-            return e.tail
+            if e.tag == 'mtr':
+                # closing a mathjax row
+                return '\n'
+            if e.tag in ('mn', 'msub'):
+                # the tail following these elements is dropped
+                return ''
+            return re.sub(r'\n+', '', e.tail)
 
         if e.tag == 'a':
             return f'![{e.text}]({e.attrib["href"]})'
         if e.tag == 'code':
             return f'`{e.text}`'
-        if e.tag == 'mfenced':
-            return f'{e.attrib["open"]}{e.text}'
         if e.tag == 'dt':
             return f'\n{CommandDocs.BREAK}- '
         if e.tag == 'li':
             return f'\n{CommandDocs.BREAK}-{e.text}'
-        return e.text
+        return re.sub(r'\n+', '', e.text)
 
     @staticmethod
     def xml_text(e):
+        """Flatten element ``e`` into a single whitespace-normalized string.
+
+        Tables and code blocks are replaced by placeholders and a handful of
+        MathML elements are rendered as plain-text expressions.
+        """
+        def paren(expr):
+            # bare identifiers/numbers need no parentheses
+            if re.match(r'^[a-zA-Z0-9_]+$', expr):
+                return expr
+            return f'({expr})'
+
+        def mfenced(fence):
+            # MathML defaults to "(" and ")" when the attributes are absent
+            opening = fence.attrib.get('open', '(')
+            closing = fence.attrib.get('close', ')')
+            # children are comma separated unless the closing fence is empty
+            separator = ', ' if closing else ' '
+            content = separator.join(DocsGL.xml_text(c) for c in fence)
+            return f'{opening}{content}{closing}'
+
         text = ''.join(glad.util.itertext(
             e,
-            ignore=('table', 'pre'), # tables and code blocks are not supported yet
+            convert={
+                # tables and code blocks are not supported yet
+                'table': lambda _: '(table omitted)',
+                'pre': lambda _: '(code omitted)',
+                'mfrac': lambda e: f'{paren(DocsGL.xml_text(e[0]))}/{paren(DocsGL.xml_text(e[1]))}',
+                'msup': lambda e: f'{paren(DocsGL.xml_text(e[0]))}^{paren(DocsGL.xml_text(e[1]))}',
+                'msub': lambda e: f'{paren(DocsGL.xml_text(e[0]))}_{paren(DocsGL.xml_text(e[1]))}',
+                'mtd': lambda e: f'{DocsGL.xml_text(e[0])}; ',
+                'mfenced': mfenced,
+            },
             format=DocsGL.format,
         ))
-        return re.sub(r'\n? +', ' ', text.strip())
+        # \u00a0 (no-break space), \u2062 (invisible times) and
+        # \u2061 (function application) are invisible characters
+        # used by docs.gl to separate words.
+        return re.sub(r'\n?[ \u00a0\u2062\u2061]+', ' ', text.strip())
diff --git a/glad/parse.py b/glad/parse.py
@@ -1,4 +1,3 @@
-from tkinter import N
 from glad.sink import LoggingSink
 
 try:

diff --git a/glad/util.py b/glad/util.py
@@ -181,17 +181,31 @@ def _format_none(e, is_tail=False):
     return e.tail if is_tail else e.text
 
 
-def itertext(element, ignore=(), format=_format_none):
+def itertext(element, ignore=(), convert=None, format=_format_none):
+    """Yield the text fragments of ``element`` and its descendants in document order.
+
+    Elements whose tag is in ``ignore`` contribute neither their text nor their
+    subtree. For a tag found in ``convert`` the mapped callable is invoked with
+    the element and its result is yielded instead. The text of other elements
+    and the tail of every child are passed through ``format`` when non-empty.
+    """
+    if convert is None:
+        # avoid sharing one mutable default mapping between calls
+        convert = {}
+
     tag = element.tag
     if tag in ignore:
         return
+    if tag in convert:
+        yield convert[tag](element)
+        return
 
     if not isinstance(tag, basestring) and tag is not None:
         return
     if element.text:
         yield format(element)
     for e in element:
-        for s in itertext(e, ignore=ignore, format=format):
+        for s in itertext(e, ignore=ignore, convert=convert, format=format):
             yield s
         if e.tail:
             yield format(e, is_tail=True)