Refactor changelog class and make tokenizer separate
parent
1676b28f03
commit
c09df3a770
@ -0,0 +1,153 @@
|
||||
# yaclog: yet another changelog tool
|
||||
# Copyright (c) 2021. Andrew Cassidy
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as
|
||||
# published by the Free Software Foundation, either version 3 of the
|
||||
# License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
# characters that may start a markdown bullet list item
bullets = '+-*'
# bracket characters used by markdown link syntax
brackets = '[]'

# opening/closing fence of a code block
code_regex = re.compile(r'^```')
# ATX-style header, e.g. '## Title', with optional trailing hashes
header_regex = re.compile(r'^(?P<hashes>#+)\s+(?P<contents>[^#]+)(?:\s+#+)?$')
# any top-level list item, bulleted or numbered
li_regex = re.compile(r'^[-+*] |\d+\. ')
# numbered list item, e.g. '1. '
numbered_regex = re.compile(r'^\d+\. ')
# bulleted list item, e.g. '- '
bullet_regex = re.compile(r'^[-+*] ')
# link definition line in the form '[id]: link'
link_id_regex = re.compile(r'^\[(?P<link_id>\S*)]:\s*(?P<link>.*)')
link_def_regex = re.compile(r'\[(?P<text>.*?)]\[(?P<link_id>.*?)]')  # deferred link in the form [name][id]
link_lit_regex = re.compile(r'\[(?P<text>.*?)]\((?P<link>.*?)\)')  # literal link in the form [name](url)

# setext-style headers ('Title' underlined with = or -), converted to ATX form by tokenize()
setext_h1_replace_regex = re.compile(r'(?<=\n)(?P<header>[^\n]+?)\n=+[ \t]*(?=\n)')
setext_h2_replace_regex = re.compile(r'(?<=\n)(?P<header>[^\n]+?)\n-+[ \t]*(?=\n)')
|
||||
|
||||
|
||||
def strip_link(token):
    """
    Parses and removes any links from the token

    :param token: An input token which may be a markdown link, either literal or an ID
    :return: A tuple of (name, url, id)
    """

    literal = link_lit_regex.fullmatch(token)
    if literal:
        # literal link in the form [name](link)
        return literal['text'], literal['link'], None

    deferred = link_def_regex.fullmatch(token)
    if deferred:
        # deferred link in the form [name][id]; the id is hopefully defined elsewhere in the document
        return deferred['text'], None, deferred['link_id'].lower()

    # not a link at all: pass the token through unchanged
    return token, None, None
|
||||
|
||||
|
||||
def join(segments: List[str]) -> str:
    """
    Joins multiple lines of markdown by adding double newlines between them, or a single newline between list items

    :param segments: A list of strings to join
    :return: A joined markdown string
    """

    parts: List[str] = []
    previous = ''

    for current in segments:
        same_bullet_run = bullet_regex.match(current) and bullet_regex.match(previous)
        same_numbered_run = numbered_regex.match(current) and numbered_regex.match(previous)

        # consecutive items of the same list kind stay on adjacent lines;
        # everything else gets a blank line inserted before it
        if not (same_bullet_run or same_numbered_run):
            parts.append('')

        parts.append(current)
        previous = current

    return '\n'.join(parts).strip()
|
||||
|
||||
|
||||
class Token:
    """A single top-level block of a markdown document."""

    def __init__(self, line_no: int, lines: List[str], kind: str):
        """
        :param line_no: the line number the token starts at in the source text
        :param lines: the raw source lines making up the token
        :param kind: the token kind, e.g. 'h1', 'li', 'p', or 'code'
        """
        self.line_no = line_no  # first line number of the token in the source
        self.lines = lines  # raw source lines of the token
        self.kind = kind  # token kind identifier

    def __str__(self):
        return f'{self.kind}: {self.lines}'

    def __repr__(self):
        # unambiguous form for debugging; added alongside the display form above
        return f'Token({self.line_no!r}, {self.lines!r}, {self.kind!r})'
|
||||
|
||||
|
||||
def tokenize(text: str):
    """
    Tokenize a markdown string

    The tokenizer is very basic, and only cares about the highest-level blocks
    (Headers, top-level list items, links, code blocks, paragraphs).

    :param text: input text to tokenize
    :return: A tuple of (tokens, links) where tokens is a list of Token objects
        and links is a dict mapping lowercase link ids to their urls
    """

    # convert setext-style headers to ATX style
    # The extra newline is to preserve line numbers
    text = setext_h1_replace_regex.sub(r'# \g<header>\n', text)
    text = setext_h2_replace_regex.sub(r'## \g<header>\n', text)

    lines = text.split('\n')
    tokens: List[Token] = []
    links = {}

    # state variable for parsing: kind of the block currently being
    # accumulated ('code', 'li', 'p'), or None when between blocks
    block = None

    for line_no, line in enumerate(lines):
        if block == 'code':
            # this is the contents of a code block; append verbatim until the closing fence
            assert block == tokens[-1].kind, 'block state variable in invalid state!'
            tokens[-1].lines.append(line)
            if code_regex.match(line):
                block = None

        elif code_regex.match(line):
            # this is the start of a code block
            tokens.append(Token(line_no, [line], 'code'))
            block = 'code'

        elif li_regex.match(line):
            # this is a top-level list item
            tokens.append(Token(line_no, [line], 'li'))
            block = 'li'

        elif match := header_regex.match(line):
            # this is a header; its kind encodes the level, e.g. 'h2'
            kind = f'h{len(match["hashes"])}'
            tokens.append(Token(line_no, [line], kind))
            # BUGFIX: headers are single-line, so reset the block state here.
            # Previously a header directly followed by a text line (no blank
            # line between) left `block` pointing at the earlier paragraph or
            # list item and tripped the invariant assert below.
            block = None

        elif match := link_id_regex.match(line):
            # this is a link definition in the form '[id]: link'
            links[match['link_id'].lower()] = match['link']
            block = None

        elif not line or line.isspace():
            # skip empty lines and reset block
            block = None

        elif block:
            # this is a continuation line added to the current paragraph or list item
            assert block == tokens[-1].kind, f'block state variable in invalid state! {block} != {tokens[-1].kind}'
            tokens[-1].lines.append(line)

        else:
            # this is the first line of a new paragraph
            tokens.append(Token(line_no, [line], 'p'))
            block = 'p'

    return tokens, links
|
Loading…
Reference in New Issue