From c09df3a77094d51a7296abee92f3eb7cd94bb0db Mon Sep 17 00:00:00 2001 From: drewcassidy Date: Tue, 27 Apr 2021 18:56:53 -0700 Subject: [PATCH] Refactor changelog class and make tokenizer separate --- tests/common.py | 5 +- tests/test_cli.py | 6 +- yaclog/changelog.py | 158 +++++++-------------- yaclog/cli/__main__.py | 12 +- yaclog/markdown.py | 153 ++++++++++++++++++++ yaclog/{cli/version_util.py => version.py} | 2 + 6 files changed, 222 insertions(+), 114 deletions(-) create mode 100644 yaclog/markdown.py rename yaclog/{cli/version_util.py => version.py} (99%) diff --git a/tests/common.py b/tests/common.py index 9ad0c75..8b9a797 100644 --- a/tests/common.py +++ b/tests/common.py @@ -1,6 +1,9 @@ import datetime import os.path import textwrap + +import changelog +import version import yaclog.changelog log_segments = [ @@ -47,7 +50,7 @@ log_text = '\n\n'.join(log_segments) log = yaclog.Changelog() log.header = '# Changelog\n\nThis changelog is for testing the parser, and has many things in it that might trip it up.' 
log.links = {'id': 'http://www.koalastothemax.com'} -log.versions = [yaclog.changelog.VersionEntry(), yaclog.changelog.VersionEntry(), yaclog.changelog.VersionEntry()] +log.versions = [changelog.VersionEntry(), changelog.VersionEntry(), changelog.VersionEntry()] log.versions[0].name = '[Tests]' log.versions[0].sections = { diff --git a/tests/test_cli.py b/tests/test_cli.py index 1ee2a33..39287ae 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,6 +2,8 @@ import unittest import os.path import git +import changelog +import version import yaclog from yaclog.cli.__main__ import cli from click.testing import CliRunner @@ -64,7 +66,7 @@ class TestTagging(unittest.TestCase): with runner.isolated_filesystem(): in_log = yaclog.Changelog(location) - in_log.versions = [yaclog.changelog.VersionEntry(), yaclog.changelog.VersionEntry()] + in_log.versions = [changelog.VersionEntry(), changelog.VersionEntry()] in_log.versions[0].name = '1.0.0' in_log.versions[1].name = '0.9.0' @@ -92,7 +94,7 @@ class TestTagging(unittest.TestCase): with runner.isolated_filesystem(): in_log = yaclog.Changelog(location) in_log.versions = [None, None] - in_log.versions = [yaclog.changelog.VersionEntry(), yaclog.changelog.VersionEntry()] + in_log.versions = [changelog.VersionEntry(), changelog.VersionEntry()] in_log.versions[0].name = '1.0.0' in_log.versions[0].tags = ['TAG1'] diff --git a/yaclog/changelog.py b/yaclog/changelog.py index caf15da..b4bf0d3 100644 --- a/yaclog/changelog.py +++ b/yaclog/changelog.py @@ -17,49 +17,13 @@ import datetime import os import re -from typing import List, Tuple, Optional, Dict +from typing import List, Optional, Dict -bullets = '+-*' -brackets = '[]' - -code_regex = re.compile(r'^```') -header_regex = re.compile(r'^(?P#+)\s+(?P[^#]+)(?:\s+#+)?$') -under1_regex = re.compile(r'^=+\s*$') -under2_regex = re.compile(r'^-+\s*$') -bullet_regex = re.compile(r'^[-+*]') -linkid_regex = re.compile(r'^\[(?P\S*)]:\s*(?P.*)') +import markdown default_header = '# 
Changelog\n\nAll notable changes to this project will be documented in this file' -def _strip_link(token): - if link_literal := re.fullmatch(r'\[(.*?)]\((.*?)\)', token): - # in the form [name](link) - return link_literal[1], link_literal[2], None - - if link_id := re.fullmatch(r'\[(.*?)]\[(.*?)]', token): - # in the form [name][id] where id is hopefully linked somewhere else in the document - return link_id[1], None, link_id[2].lower() - - return token, None, None - - -def _join_markdown(segments: List[str]) -> str: - text: List[str] = [] - last_bullet = False - for segment in segments: - is_bullet = bullet_regex.match(segment) - - if not is_bullet or not last_bullet: - text.append('') - - text.append(segment) - - last_bullet = is_bullet - - return '\n'.join(text).strip() - - class VersionEntry: """Holds a single version entry in a :py:class:`Changelog`""" @@ -73,16 +37,32 @@ class VersionEntry: :param date: When the version was released :param tags: The version's tags :param link: The version's URL - :param link_id: The version's link ID, uses the version name by default when writing + :param link_id: The version's link ID """ self.name: str = name + """The version's name""" + self.date: Optional[datetime.date] = date + """When the version was released""" + self.tags: List[str] = tags if tags else [] + """The version's tags""" + self.link: Optional[str] = link + """The version's URL""" + self.link_id: Optional[str] = link_id - self.line_no: int = -1 + """The version's link ID, uses the version name by default when writing""" + + self.line_no: Optional[int] = None + """What line the version occurs at in the file, or None if the version was not read from a file. + This is not guaranteed to be correct after the changelog has been modified, + and it has no effect on the written file""" + self.sections: Dict[str, List[str]] = {'': []} + """The dictionary of change entries in the version, organized by section. 
+ Uncategorized changes have a section of an empty string.""" def add_entry(self, contents: str, section: str = '') -> None: """ @@ -118,7 +98,7 @@ class VersionEntry: if len(entries) > 0: segments += entries - return _join_markdown(segments) + return markdown.join(segments) def header(self, md: bool = True) -> str: """ @@ -167,6 +147,8 @@ class VersionEntry: class Changelog: + """A changelog made up of a header, several versions, and a link table""" + def __init__(self, path=None, header: str = default_header): """ Create a new changelog object. Contents will be automatically read from disk if the file exists @@ -174,10 +156,17 @@ class Changelog: :param path: The changelog's path on disk :param header: The header at the top of the changelog to use if the file does not exist """ - self.path = path + self.path = os.path.abspath(path) if path else None + """The path of the changelog's file on disk""" + self.header: str = header + """Any text at the top of the changelog before any H2s""" + self.versions: List[VersionEntry] = [] + """A list of versions in the changelog""" + self.links = {} + """Link IDs at the end of the changelog""" if path and os.path.exists(path): self.read() @@ -195,70 +184,15 @@ class Changelog: # Read file with open(path, 'r') as fp: - lines = fp.readlines() + tokens, self.links = markdown.tokenize(fp.read()) section = '' - in_block = False - in_code = False - - segments: List[Tuple[int, List[str], str]] = [] header_segments = [] - for line_no, line in enumerate(lines): - if in_code: - # this is the contents of a code block - segments[-1][1].append(line) - if code_regex.match(line): - in_code = False - in_block = False + for token in tokens: + text = '\n'.join(token.lines) - elif code_regex.match(line): - # this is the start of a code block - in_code = True - segments.append((line_no, [line], 'code')) - - elif under1_regex.match(line) and in_block and len(segments[-1][1]) == 1 and segments[-1][2] == 'p': - # this is an underline for a 
setext-style H1 - # ugly but it works - last = segments.pop() - segments.append((last[0], last[1] + [line], 'h1')) - - elif under2_regex.match(line) and in_block and len(segments[-1][1]) == 1 and segments[-1][2] == 'p': - # this is an underline for a setext-style H2 - # ugly but it works - last = segments.pop() - segments.append((last[0], last[1] + [line], 'h2')) - - elif bullet_regex.match(line): - in_block = True - segments.append((line_no, [line], 'li')) - - elif match := header_regex.match(line): - # this is a header - kind = f'h{len(match["hashes"])}' - segments.append((line_no, [line], kind)) - in_block = False - - elif match := linkid_regex.match(line): - # this is a link definition in the form '[id]: link', so add it to the link table - self.links[match['link_id'].lower()] = match['link'] - - elif line.isspace(): - # skip empty lines - in_block = False - - elif in_block: - # this is a line to be added to a paragraph - segments[-1][1].append(line) - else: - # this is a new paragraph - in_block = True - segments.append((line_no, [line], 'p')) - - for segment in segments: - text = ''.join(segment[1]).strip() - - if segment[2] == 'h2': + if token.kind == 'h2': # start of a version slug = text.rstrip('-').strip('#').strip() @@ -270,7 +204,7 @@ class Changelog: section = '' version.name = slug - version.line_no = segment[0] + version.line_no = token.line_no tags = [] date = None @@ -288,7 +222,7 @@ class Changelog: else: # matches the schema - version.name, version.link, version.link_id = _strip_link(split[0]) + version.name, version.link, version.link_id = markdown.strip_link(split[0]) version.date = date version.tags = tags @@ -299,7 +233,7 @@ class Changelog: # so its best to just add this line to the header string header_segments.append(text) - elif segment[2] == 'h3': + elif token.kind == 'h3': # start of a version section section = text.strip('#').strip() if section not in self.versions[-1].sections.keys(): @@ -324,7 +258,7 @@ class Changelog: version.link 
= self.links[version.link_id] # strip whitespace from header - self.header = _join_markdown(header_segments) + self.header = markdown.join(header_segments) def write(self, path: os.PathLike = None) -> None: """ @@ -348,7 +282,19 @@ class Changelog: segments += [f'[{link_id}]: {link}' for link_id, link in v_links.items()] - text = _join_markdown(segments) + text = markdown.join(segments) with open(path, 'w') as fp: fp.write(text) + + def add_version(self, index: int = 0, *args, **kwargs) -> VersionEntry: + version = VersionEntry(*args, **kwargs) + self.versions.insert(index, version) + + return version + + def current(self, new_version_name='Unreleased') -> VersionEntry: + if len(self.versions) == 0: + return self.add_version(name=new_version_name) + + return self.versions[0] diff --git a/yaclog/cli/__main__.py b/yaclog/cli/__main__.py index e9558a6..6fb4066 100644 --- a/yaclog/cli/__main__.py +++ b/yaclog/cli/__main__.py @@ -18,7 +18,9 @@ import click import os.path import datetime import git -import yaclog.cli.version_util + +import changelog +import yaclog.version from yaclog import Changelog @@ -141,7 +143,7 @@ def entry(obj: Changelog, bullets, paragraphs, section_name, version_name): else: matches = [v for v in obj.versions if v.name.lower() == 'unreleased'] if len(matches) == 0: - version = yaclog.changelog.VersionEntry() + version = changelog.VersionEntry() obj.versions.insert(0, version) else: version = matches[0] @@ -192,11 +194,11 @@ def release(obj: Changelog, v_flag, commit): if v_flag: if v_flag[0] == '+': - new_name = yaclog.cli.version_util.increment_version(version, v_flag) + new_name = yaclog.version.increment_version(version, v_flag) else: new_name = v_flag - if yaclog.cli.version_util.is_release(cur_version.name): + if yaclog.version.is_release(cur_version.name): click.confirm(f'Rename release version "{cur_version.name}" to "{new_name}"?', abort=True) cur_version.name = new_name @@ -213,7 +215,7 @@ def release(obj: Changelog, v_flag, commit): 
repo.index.add(obj.path) - version_type = '' if yaclog.cli.version_util.is_release(cur_version.name) else 'non-release ' + version_type = '' if yaclog.version.is_release(cur_version.name) else 'non-release ' tracked = len(repo.index.diff(repo.head.commit)) tracked_warning = 'Create tag' untracked = len(repo.index.diff(None)) diff --git a/yaclog/markdown.py b/yaclog/markdown.py new file mode 100644 index 0000000..d854490 --- /dev/null +++ b/yaclog/markdown.py @@ -0,0 +1,153 @@ +# yaclog: yet another changelog tool +# Copyright (c) 2021. Andrew Cassidy +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +import re +from typing import List + +bullets = '+-*' +brackets = '[]' +code_regex = re.compile(r'^```') +header_regex = re.compile(r'^(?P<hashes>#+)\s+(?P<contents>[^#]+)(?:\s+#+)?$') +li_regex = re.compile(r'^[-+*] |\d+\. ') +numbered_regex = re.compile(r'^\d+\. ') +bullet_regex = re.compile(r'^[-+*] ') +link_id_regex = re.compile(r'^\[(?P<link_id>\S*)]:\s*(?P<link>.*)') +link_def_regex = re.compile(r'\[(?P<text>.*?)]\[(?P<link_id>.*?)]') # deferred link in the form [name][id] +link_lit_regex = re.compile(r'\[(?P<text>.*?)]\((?P<link>.*?)\)') # literal link in the form [name](url) + +setext_h1_replace_regex = re.compile(r'(?<=\n)(?P
[^\n]+?)\n=+[ \t]*(?=\n)') +setext_h2_replace_regex = re.compile(r'(?<=\n)(?P
[^\n]+?)\n-+[ \t]*(?=\n)') + + +def strip_link(token): + """ + Parses and removes any links from the token + + :param token: An input token which may be a markdown link, either literal or an ID + :return: A tuple of (name, url, id) + """ + + if link_lit := link_lit_regex.fullmatch(token): + # in the form [name](link) + return link_lit['text'], link_lit['link'], None + + if link_def := link_def_regex.fullmatch(token): + # in the form [name][id] where id is hopefully linked somewhere else in the document + return link_def['text'], None, link_def['link_id'].lower() + + return token, None, None + + +def join(segments: List[str]) -> str: + """ + Joins multiple lines of markdown by adding double newlines between them, or a single newline between list items + + :param segments: A list of strings to join + :return: A joined markdown string + """ + + text: List[str] = [] + last_segment = '' + for segment in segments: + if bullet_regex.match(segment) and bullet_regex.match(last_segment): + pass + elif numbered_regex.match(segment) and numbered_regex.match(last_segment): + pass + else: + text.append('') + + text.append(segment) + + last_segment = segment + + return '\n'.join(text).strip() + + +class Token: + def __init__(self, line_no: int, lines: List[str], kind: str): + self.line_no = line_no + self.lines = lines + self.kind = kind + + def __str__(self): + return f'{self.kind}: {self.lines}' + + +def tokenize(text: str): + """ + Tokenize a markdown string + + The tokenizer is very basic, and only cares about the highest-level blocks + (Headers, top-level list items, links, code blocks, paragraphs). + + :param text: input text to tokenize + :return: A list of tokens + """ + + # convert setext-style headers + # The extra newline is to preserve line numbers + text = setext_h1_replace_regex.sub(r'# \g
\n', text) + text = setext_h2_replace_regex.sub(r'## \g
\n', text) + + lines = text.split('\n') + tokens: List[Token] = [] + links = {} + + # state variables for parsing + block = None + + for line_no, line in enumerate(lines): + if block == 'code': + # this is the contents of a code block + assert block == tokens[-1].kind, 'block state variable in invalid state!' + tokens[-1].lines.append(line) + if code_regex.match(line): + block = None + + elif code_regex.match(line): + # this is the start of a code block + tokens.append(Token(line_no, [line], 'code')) + block = 'code' + + elif li_regex.match(line): + # this is a list item + tokens.append(Token(line_no, [line], 'li')) + block = 'li' + + elif match := header_regex.match(line): + # this is a header + kind = f'h{len(match["hashes"])}' + tokens.append(Token(line_no, [line], kind)) + + elif match := link_id_regex.match(line): + # this is a link definition in the form '[id]: link' + links[match['link_id'].lower()] = match['link'] + block = None + + elif not line or line.isspace(): + # skip empty lines and reset block + block = None + + elif block: + # this is a line to be added to a paragraph or list item + assert block == tokens[-1].kind, f'block state variable in invalid state! {block} != {tokens[-1].kind}' + tokens[-1].lines.append(line) + + else: + # this is a new paragraph + tokens.append(Token(line_no, [line], 'p')) + block = 'p' + + return tokens, links diff --git a/yaclog/cli/version_util.py b/yaclog/version.py similarity index 99% rename from yaclog/cli/version_util.py rename to yaclog/version.py index f712b34..bbb2762 100644 --- a/yaclog/cli/version_util.py +++ b/yaclog/version.py @@ -81,3 +81,5 @@ def join_version(epoch, release, pre, post, dev, local) -> str: parts.append(f"+{local}") return "".join(parts) + +