From c09df3a77094d51a7296abee92f3eb7cd94bb0db Mon Sep 17 00:00:00 2001
From: drewcassidy <drewcassidy@me.com>
Date: Tue, 27 Apr 2021 18:56:53 -0700
Subject: [PATCH] Refactor changelog class and make tokenizer separate

---
 tests/common.py                            |   5 +-
 tests/test_cli.py                          |   6 +-
 yaclog/changelog.py                        | 158 +++++++--------------
 yaclog/cli/__main__.py                     |  12 +-
 yaclog/markdown.py                         | 153 ++++++++++++++++++++
 yaclog/{cli/version_util.py => version.py} |   2 +
 6 files changed, 222 insertions(+), 114 deletions(-)
 create mode 100644 yaclog/markdown.py
 rename yaclog/{cli/version_util.py => version.py} (99%)
diff --git a/tests/common.py b/tests/common.py
index 9ad0c75..8b9a797 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -1,6 +1,9 @@
 import datetime
 import os.path
 import textwrap
+
+from yaclog import changelog
+from yaclog import version
 import yaclog.changelog
 
 log_segments = [
@@ -47,7 +50,7 @@ log_text = '\n\n'.join(log_segments)
 log = yaclog.Changelog()
 log.header = '# Changelog\n\nThis changelog is for testing the parser, and has many things in it that might trip it up.'
 log.links = {'id': 'http://www.koalastothemax.com'}
-log.versions = [yaclog.changelog.VersionEntry(), yaclog.changelog.VersionEntry(), yaclog.changelog.VersionEntry()]
+log.versions = [changelog.VersionEntry(), changelog.VersionEntry(), changelog.VersionEntry()]
 
 log.versions[0].name = '[Tests]'
 log.versions[0].sections = {
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 1ee2a33..39287ae 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2,6 +2,8 @@ import unittest
 import os.path
 import git
 
+from yaclog import changelog
+from yaclog import version
 import yaclog
 from yaclog.cli.__main__ import cli
 from click.testing import CliRunner
@@ -64,7 +66,7 @@ class TestTagging(unittest.TestCase):
 
         with runner.isolated_filesystem():
             in_log = yaclog.Changelog(location)
-            in_log.versions = [yaclog.changelog.VersionEntry(), yaclog.changelog.VersionEntry()]
+            in_log.versions = [changelog.VersionEntry(), changelog.VersionEntry()]
 
             in_log.versions[0].name = '1.0.0'
             in_log.versions[1].name = '0.9.0'
@@ -92,7 +94,7 @@ class TestTagging(unittest.TestCase):
         with runner.isolated_filesystem():
             in_log = yaclog.Changelog(location)
             in_log.versions = [None, None]
-            in_log.versions = [yaclog.changelog.VersionEntry(), yaclog.changelog.VersionEntry()]
+            in_log.versions = [changelog.VersionEntry(), changelog.VersionEntry()]
 
             in_log.versions[0].name = '1.0.0'
             in_log.versions[0].tags = ['TAG1']
diff --git a/yaclog/changelog.py b/yaclog/changelog.py
index caf15da..b4bf0d3 100644
--- a/yaclog/changelog.py
+++ b/yaclog/changelog.py
@@ -17,49 +17,13 @@
 import datetime
 import os
 import re
-from typing import List, Tuple, Optional, Dict
+from typing import List, Optional, Dict
 
-bullets = '+-*'
-brackets = '[]'
-
-code_regex = re.compile(r'^```')
-header_regex = re.compile(r'^(?P<hashes>#+)\s+(?P<contents>[^#]+)(?:\s+#+)?$')
-under1_regex = re.compile(r'^=+\s*$')
-under2_regex = re.compile(r'^-+\s*$')
-bullet_regex = re.compile(r'^[-+*]')
-linkid_regex = re.compile(r'^\[(?P<link_id>\S*)]:\s*(?P<link>.*)')
+from yaclog import markdown
 
 default_header = '# Changelog\n\nAll notable changes to this project will be documented in this file'
 
 
-def _strip_link(token):
-    if link_literal := re.fullmatch(r'\[(.*?)]\((.*?)\)', token):
-        # in the form [name](link)
-        return link_literal[1], link_literal[2], None
-
-    if link_id := re.fullmatch(r'\[(.*?)]\[(.*?)]', token):
-        # in the form [name][id] where id is hopefully linked somewhere else in the document
-        return link_id[1], None, link_id[2].lower()
-
-    return token, None, None
-
-
-def _join_markdown(segments: List[str]) -> str:
-    text: List[str] = []
-    last_bullet = False
-    for segment in segments:
-        is_bullet = bullet_regex.match(segment)
-
-        if not is_bullet or not last_bullet:
-            text.append('')
-
-        text.append(segment)
-
-        last_bullet = is_bullet
-
-    return '\n'.join(text).strip()
-
-
 class VersionEntry:
     """Holds a single version entry in a :py:class:`Changelog`"""
 
@@ -73,16 +37,32 @@ class VersionEntry:
         :param date: When the version was released
         :param tags: The version's tags
         :param link: The version's URL
-        :param link_id: The version's link ID, uses the version name by default when writing
+        :param link_id: The version's link ID
         """
 
         self.name: str = name
+        """The version's name"""
+
         self.date: Optional[datetime.date] = date
+        """When the version was released"""
+
         self.tags: List[str] = tags if tags else []
+        """The version's tags"""
+
         self.link: Optional[str] = link
+        """The version's URL"""
+
         self.link_id: Optional[str] = link_id
-        self.line_no: int = -1
+        """The version's link ID, uses the version name by default when writing"""
+
+        self.line_no: Optional[int] = None
+        """What line the version occurs at in the file, or None if the version was not read from a file. 
+        This is not guaranteed to be correct after the changelog has been modified, 
+        and it has no effect on the written file"""
+
         self.sections: Dict[str, List[str]] = {'': []}
+        """The dictionary of change entries in the version, organized by section. 
+        Uncategorized changes have a section of an empty string."""
 
     def add_entry(self, contents: str, section: str = '') -> None:
         """
@@ -118,7 +98,7 @@ class VersionEntry:
             if len(entries) > 0:
                 segments += entries
 
-        return _join_markdown(segments)
+        return markdown.join(segments)
 
     def header(self, md: bool = True) -> str:
         """
@@ -167,6 +147,8 @@ class VersionEntry:
 
 
 class Changelog:
+    """A changelog made up of a header, several versions, and a link table"""
+
     def __init__(self, path=None, header: str = default_header):
         """
         Create a new changelog object. Contents will be automatically read from disk if the file exists
@@ -174,10 +156,17 @@ class Changelog:
         :param path: The changelog's path on disk
         :param header: The header at the top of the changelog to use if the file does not exist
         """
-        self.path = path
+        self.path = os.path.abspath(path) if path else None
+        """The path of the changelog's file on disk"""
+
         self.header: str = header
+        """Any text at the top of the changelog before any H2s"""
+
         self.versions: List[VersionEntry] = []
+        """A list of versions in the changelog"""
+
         self.links = {}
+        """Link IDs at the end of the changelog"""
 
         if path and os.path.exists(path):
             self.read()
@@ -195,70 +184,15 @@ class Changelog:
 
         # Read file
         with open(path, 'r') as fp:
-            lines = fp.readlines()
+            tokens, self.links = markdown.tokenize(fp.read())
 
         section = ''
-        in_block = False
-        in_code = False
-
-        segments: List[Tuple[int, List[str], str]] = []
         header_segments = []
 
-        for line_no, line in enumerate(lines):
-            if in_code:
-                # this is the contents of a code block
-                segments[-1][1].append(line)
-                if code_regex.match(line):
-                    in_code = False
-                    in_block = False
+        for token in tokens:
+            text = '\n'.join(token.lines)
 
-            elif code_regex.match(line):
-                # this is the start of a code block
-                in_code = True
-                segments.append((line_no, [line], 'code'))
-
-            elif under1_regex.match(line) and in_block and len(segments[-1][1]) == 1 and segments[-1][2] == 'p':
-                # this is an underline for a setext-style H1
-                # ugly but it works
-                last = segments.pop()
-                segments.append((last[0], last[1] + [line], 'h1'))
-
-            elif under2_regex.match(line) and in_block and len(segments[-1][1]) == 1 and segments[-1][2] == 'p':
-                # this is an underline for a setext-style H2
-                # ugly but it works
-                last = segments.pop()
-                segments.append((last[0], last[1] + [line], 'h2'))
-
-            elif bullet_regex.match(line):
-                in_block = True
-                segments.append((line_no, [line], 'li'))
-
-            elif match := header_regex.match(line):
-                # this is a header
-                kind = f'h{len(match["hashes"])}'
-                segments.append((line_no, [line], kind))
-                in_block = False
-
-            elif match := linkid_regex.match(line):
-                # this is a link definition in the form '[id]: link', so add it to the link table
-                self.links[match['link_id'].lower()] = match['link']
-
-            elif line.isspace():
-                # skip empty lines
-                in_block = False
-
-            elif in_block:
-                # this is a line to be added to a paragraph
-                segments[-1][1].append(line)
-            else:
-                # this is a new paragraph
-                in_block = True
-                segments.append((line_no, [line], 'p'))
-
-        for segment in segments:
-            text = ''.join(segment[1]).strip()
-
-            if segment[2] == 'h2':
+            if token.kind == 'h2':
                 # start of a version
 
                 slug = text.rstrip('-').strip('#').strip()
@@ -270,7 +204,7 @@ class Changelog:
                 section = ''
 
                 version.name = slug
-                version.line_no = segment[0]
+                version.line_no = token.line_no
                 tags = []
                 date = None
 
@@ -288,7 +222,7 @@ class Changelog:
 
                 else:
                     # matches the schema
-                    version.name, version.link, version.link_id = _strip_link(split[0])
+                    version.name, version.link, version.link_id = markdown.strip_link(split[0])
                     version.date = date
                     version.tags = tags
 
@@ -299,7 +233,7 @@ class Changelog:
                 # so its best to just add this line to the header string
                 header_segments.append(text)
 
-            elif segment[2] == 'h3':
+            elif token.kind == 'h3':
                 # start of a version section
                 section = text.strip('#').strip()
                 if section not in self.versions[-1].sections.keys():
@@ -324,7 +258,7 @@ class Changelog:
                 version.link = self.links[version.link_id]
 
         # strip whitespace from header
-        self.header = _join_markdown(header_segments)
+        self.header = markdown.join(header_segments)
 
     def write(self, path: os.PathLike = None) -> None:
         """
@@ -348,7 +282,19 @@ class Changelog:
 
         segments += [f'[{link_id}]: {link}' for link_id, link in v_links.items()]
 
-        text = _join_markdown(segments)
+        text = markdown.join(segments)
 
         with open(path, 'w') as fp:
             fp.write(text)
+
+    def add_version(self, index: int = 0, *args, **kwargs) -> VersionEntry:
+        """Create a VersionEntry from ``*args``/``**kwargs``, insert it at ``index``, and return it"""
+        entry = VersionEntry(*args, **kwargs)
+        self.versions.insert(index, entry)
+        return entry
+
+    def current(self, new_version_name='Unreleased') -> VersionEntry:
+        """Return the first version in the list, creating one named ``new_version_name`` if the list is empty"""
+        if self.versions:
+            return self.versions[0]
+        return self.add_version(name=new_version_name)
diff --git a/yaclog/cli/__main__.py b/yaclog/cli/__main__.py
index e9558a6..6fb4066 100644
--- a/yaclog/cli/__main__.py
+++ b/yaclog/cli/__main__.py
@@ -18,7 +18,9 @@ import click
 import os.path
 import datetime
 import git
-import yaclog.cli.version_util
+
+from yaclog import changelog
+import yaclog.version
 from yaclog import Changelog
 
 
@@ -141,7 +143,7 @@ def entry(obj: Changelog, bullets, paragraphs, section_name, version_name):
     else:
         matches = [v for v in obj.versions if v.name.lower() == 'unreleased']
         if len(matches) == 0:
-            version = yaclog.changelog.VersionEntry()
+            version = changelog.VersionEntry()
             obj.versions.insert(0, version)
         else:
             version = matches[0]
@@ -192,11 +194,11 @@ def release(obj: Changelog, v_flag, commit):
 
     if v_flag:
         if v_flag[0] == '+':
-            new_name = yaclog.cli.version_util.increment_version(version, v_flag)
+            new_name = yaclog.version.increment_version(version, v_flag)
         else:
             new_name = v_flag
 
-        if yaclog.cli.version_util.is_release(cur_version.name):
+        if yaclog.version.is_release(cur_version.name):
             click.confirm(f'Rename release version "{cur_version.name}" to "{new_name}"?', abort=True)
 
         cur_version.name = new_name
@@ -213,7 +215,7 @@ def release(obj: Changelog, v_flag, commit):
 
         repo.index.add(obj.path)
 
-        version_type = '' if yaclog.cli.version_util.is_release(cur_version.name) else 'non-release '
+        version_type = '' if yaclog.version.is_release(cur_version.name) else 'non-release '
         tracked = len(repo.index.diff(repo.head.commit))
         tracked_warning = 'Create tag'
         untracked = len(repo.index.diff(None))
diff --git a/yaclog/markdown.py b/yaclog/markdown.py
new file mode 100644
index 0000000..d854490
--- /dev/null
+++ b/yaclog/markdown.py
@@ -0,0 +1,153 @@
+#  yaclog: yet another changelog tool
+#  Copyright (c) 2021. Andrew Cassidy
+#
+#  This program is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU Affero General Public License as
+#  published by the Free Software Foundation, either version 3 of the
+#  License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU Affero General Public License for more details.
+#
+#  You should have received a copy of the GNU Affero General Public License
+#  along with this program.  If not, see <https://www.gnu.org/licenses/>.
+import re
+from typing import List
+
+bullets = '+-*'
+brackets = '[]'
+code_regex = re.compile(r'^```')
+header_regex = re.compile(r'^(?P<hashes>#+)\s+(?P<contents>[^#]+)(?:\s+#+)?$')
+li_regex = re.compile(r'^[-+*] |\d+\. ')
+numbered_regex = re.compile(r'^\d+\. ')
+bullet_regex = re.compile(r'^[-+*] ')
+link_id_regex = re.compile(r'^\[(?P<link_id>\S*)]:\s*(?P<link>.*)')
+link_def_regex = re.compile(r'\[(?P<text>.*?)]\[(?P<link_id>.*?)]')  # deferred link in the form [name][id]
+link_lit_regex = re.compile(r'\[(?P<text>.*?)]\((?P<link>.*?)\)')  # literal link in the form [name](url)
+
+setext_h1_replace_regex = re.compile(r'(?<=\n)(?P<header>[^\n]+?)\n=+[ \t]*(?=\n)')
+setext_h2_replace_regex = re.compile(r'(?<=\n)(?P<header>[^\n]+?)\n-+[ \t]*(?=\n)')
+
+
+def strip_link(token):
+    """
+    Split a token that may be a markdown link into its display text and its target
+
+    :param token: An input token which may be a markdown link, either literal or an ID
+    :return: A tuple of (name, url, id); url and id are None when the token does not supply them
+    """
+
+    literal = link_lit_regex.fullmatch(token)
+    if literal is not None:
+        return literal['text'], literal['link'], None  # literal link: [name](link)
+
+    deferred = link_def_regex.fullmatch(token)
+    if deferred is not None:
+        return deferred['text'], None, deferred['link_id'].lower()  # deferred link: [name][id], id lower-cased
+
+    return token, None, None  # not a link, so the token itself is the name
+
+
+def join(segments: List[str]) -> str:
+    """
+    Join markdown segments into one string.
+
+    Segments are separated by a blank line, except that consecutive bullet items, or consecutive numbered
+    items, are kept on adjacent lines so that they stay together as one list.
+
+    :param segments: A list of strings to join
+    :return: The joined markdown string, stripped of leading and trailing whitespace
+    """
+
+    lines: List[str] = []
+    previous = ''
+    for current in segments:
+        same_list = any(
+            kind.match(current) and kind.match(previous)
+            for kind in (bullet_regex, numbered_regex)
+        )
+        if not same_list:
+            lines.append('')  # an empty entry becomes a blank line once joined
+        lines.append(current)
+        previous = current
+    return '\n'.join(lines).strip()
+
+
+class Token:
+    """A single markdown block produced by the tokenizer"""
+
+    def __init__(self, line_no: int, lines: List[str], kind: str):
+        self.line_no, self.lines, self.kind = line_no, lines, kind  # starting line number, raw lines, block kind
+
+    def __str__(self):
+        return f'{self.kind}: {self.lines}'
+
+
+def tokenize(text: str):
+    """
+    Tokenize a markdown string
+
+    The tokenizer is very basic, and only cares about the highest-level blocks
+    (Headers, top-level list items, links, code blocks, paragraphs).
+
+    :param text: input text to tokenize
+    :return: A tuple of (tokens, links), where links maps lower-cased link IDs to their targets
+    """
+
+    # convert setext-style headers
+    # The extra newline is to preserve line numbers
+    text = setext_h1_replace_regex.sub(r'# \g<header>\n', text)
+    text = setext_h2_replace_regex.sub(r'## \g<header>\n', text)
+
+    lines = text.split('\n')
+    tokens: List[Token] = []
+    links = {}
+
+    # state variables for parsing
+    block = None
+
+    for line_no, line in enumerate(lines):
+        if block == 'code':
+            # this is the contents of a code block
+            assert block == tokens[-1].kind, 'block state variable in invalid state!'
+            tokens[-1].lines.append(line)
+            if code_regex.match(line):
+                block = None
+
+        elif code_regex.match(line):
+            # this is the start of a code block
+            tokens.append(Token(line_no, [line], 'code'))
+            block = 'code'
+
+        elif li_regex.match(line):
+            # this is a list item
+            tokens.append(Token(line_no, [line], 'li'))
+            block = 'li'
+
+        elif match := header_regex.match(line):
+            # this is a header, which is always a single line, so it also ends any open block
+            tokens.append(Token(line_no, [line], f'h{len(match["hashes"])}'))
+            block = None
+
+        elif match := link_id_regex.match(line):
+            # this is a link definition in the form '[id]: link'
+            links[match['link_id'].lower()] = match['link']
+            block = None
+
+        elif not line or line.isspace():
+            # skip empty lines and reset block
+            block = None
+
+        elif block:
+            # this is a line to be added to a paragraph or list item
+            assert block == tokens[-1].kind, f'block state variable in invalid state! {block} != {tokens[-1].kind}'
+            tokens[-1].lines.append(line)
+
+        else:
+            # this is a new paragraph
+            tokens.append(Token(line_no, [line], 'p'))
+            block = 'p'
+
+    return tokens, links
diff --git a/yaclog/cli/version_util.py b/yaclog/version.py
similarity index 99%
rename from yaclog/cli/version_util.py
rename to yaclog/version.py
index f712b34..bbb2762 100644
--- a/yaclog/cli/version_util.py
+++ b/yaclog/version.py
@@ -81,3 +81,5 @@ def join_version(epoch, release, pre, post, dev, local) -> str:
         parts.append(f"+{local}")
 
     return "".join(parts)
+
+