Last active
July 13, 2021 12:00
-
-
Save reagle/dad4e59df7b73ba935556ac9638c1703 to your computer and use it in GitHub Desktop.
Wrap text, including semantically by adding breaks at terminal punctuation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
"""Wrap text, including semantically by add breaks at | |
terminal punctuation.""" | |
# Using python 2.7 so portions can be used in Sublime Text 3 plugin | |
import argparse # http://docs.python.org/dev/library/argparse.html | |
import logging | |
import re | |
import sys | |
import textwrap | |
# Module-level shortcuts to the root-logger convenience functions, so the
# rest of the file can call e.g. ``info(...)`` directly.
debug = logging.debug
info = logging.info
# ``logging.warn`` is a deprecated alias that was removed in Python 3.13;
# ``logging.warning`` is the supported name and also exists in Python 2.7.
warn = logging.warning
error = logging.error
critical = logging.critical
exception = logging.exception
# Matches the end of a sentence so a line break can be inserted after it.
#   group 1: the lowercase word (2+ letters) or number that ends the sentence
#   group 2: the terminal punctuation, optionally combined with a closing
#            paren, bracket or double quote
#   group 3: the single whitespace character that follows
# The trailing negative lookahead rejects a match when the next character is
# a digit.  Requiring 2+ lowercase letters means a single letter or an
# uppercase word before the punctuation (e.g. "J. Smith") never matches.
SEMANTIC_BREAK_RE = re.compile(
    r"""
    (               # end of sentence includes...
        [a-z]{2,}|  # end of word
        [0-9]{1,}   # end of a page or chapter number
    )
    (               # terminal punctuation
        \.
        |\?
        |\!
        |\}
        |:
        |\.\)       # period paren
        |\)\.       # paren period
        |\.\]       # period bracket
        |\]\.       # bracket period
        |\."        # period quote
        |"\.        # quote period
        |"\)\.      # quote paren period
        |\?\)       # question paren
        |\)\?       # paren question
        |\?\]       # question bracket
        |\]\?       # bracket question
        |\?"        # question quote
        |"\?        # quote question
        |\!\)       # exclaim paren
        |\)\!       # paren exclaim
        |\!\]       # exclaim bracket
        |\]\!       # bracket exclaim
        |\!"        # exclaim quote
        |"\!        # quote exclaim
    )
    (\s)            # a whitespace
    (?!\d)          # negative lookahead for digit
    """,
    re.VERBOSE,
)
# Splits one line into (leading quote markers, remaining text).  Group 1 is
# the run of ">" and space characters at the start of the line, or None when
# the line has no such prefix; group 2 is everything after it.
QUOTES_RE = re.compile(
    r"""
    (^[> ]+)?   # start quotes
    (.*)        # rest of line
    """,
    re.VERBOSE,
)


def _fill_chunk(wrapper, chunk, quote_level):
    """Wrap the lines of *chunk* as one paragraph quoted at *quote_level*.

    Returns a ``(prefix, filled)`` tuple: the quote prefix that was applied
    to every output line, and the wrapped text ("" if the chunk is blank).
    """
    prefix = ">" * quote_level + " " if quote_level else ""
    wrapper.initial_indent = prefix
    wrapper.subsequent_indent = prefix
    joined = " ".join(chunk).strip()
    logging.info("buffer_joined = '%s'", joined)
    return prefix, wrapper.fill(joined)


def quoted_wrap(content, width):
    """Wrap quoted lines, preserving quotes; collapse empty lines.

    Consecutive non-empty lines with the same number of ">" markers are
    joined into one paragraph and re-wrapped to *width* columns, each output
    line carrying the ">" prefix of that quote level.  An empty line or a
    change of quote level ends the current paragraph.

    content -- the full text to wrap, as a single string
    width   -- maximum line width, including the quote prefix

    Returns the re-wrapped text as a single string.
    """
    wrapper = textwrap.TextWrapper(width=width)
    quote_level_prev = None
    chunk = [""]  # lines of the paragraph currently being accumulated
    new_content = []  # wrapped paragraphs and separators, in output order

    for line in content.split("\n"):
        logging.info("---")
        logging.info("line = '%s'", line)
        quotes, text = QUOTES_RE.match(line).groups()
        quote_level = quotes.count(">") if quotes else 0
        if quote_level_prev is None:  # None on first iteration
            logging.info("* no quote_level_prev")
            quote_level_prev = quote_level
        logging.info("text = '%s'", text)
        logging.info(
            "quote_level = %s quote_level_prev = %s",
            quote_level,
            quote_level_prev,
        )

        # Same quote level and non-empty text: keep accumulating.
        if quote_level == quote_level_prev and text:
            chunk.append(text)
            logging.info("buffer = %s", chunk)
            continue

        # Otherwise the paragraph ended: emit it and start a new chunk.
        logging.info("* change in quote level or new paragraph")
        logging.info("buffer = %s", chunk)
        prefix, filled = _fill_chunk(wrapper, chunk, quote_level_prev)
        new_content.append(filled)
        if not text:  # new paragraph
            logging.info("* new paragraph")
            # NOTE(review): the separator carries the prefix of the paragraph
            # just emitted, not of the blank line itself; kept as-is.
            new_content.append(prefix)
        quote_level_prev = quote_level
        chunk = [text]
        logging.info("NEW_CONTENT = %s", new_content)

    # Emit whatever is still buffered.  Without this, the last paragraph was
    # lost whenever the input did not end with a newline.
    _, filled = _fill_chunk(wrapper, chunk, quote_level_prev)
    if filled:
        new_content.append(filled)

    joined_content = "\n".join(new_content)  # TODO: remove first empty
    # Collapse any run of whitespace-only lines into a single blank line.
    return re.sub(r"\n\s*\n", "\n\n", joined_content)
def semantic_wrap(text):
    """Insert a line break after each sentence end found by SEMANTIC_BREAK_RE.

    *text* is split into paragraphs on blank lines ("\\n\\n").  In a paragraph
    that starts with ">", every physical line is handled on its own: its
    quote prefix is split off, breaks are inserted into the remaining text,
    and the same prefix is put back on each resulting line.  Any other
    paragraph is processed as a whole, without quote handling.

    Known limitation: a non-quoted line followed by quoted lines in the same
    paragraph (e.g., beginning of email: "On DATE, NAME wrote:") takes the
    non-quoted path, so breaks inserted inside those quoted lines get no
    quote prefix.

    Returns the paragraphs, each terminated by a newline, joined by "\\n".
    """
    new_content = []
    info("START")
    for paragraph in text.split("\n\n"):
        info("line = '%s'", paragraph)
        if paragraph.startswith(">"):
            info("is quoted")
            wrapped_lines = []
            # QUOTES_RE's ".*" stops at the first newline, so matching the
            # whole paragraph at once would drop every line after the first.
            # A single trailing newline is removed first so that it does not
            # produce an extra empty line.
            for physical_line in paragraph.rstrip("\n").split("\n"):
                quotes, body = QUOTES_RE.match(physical_line).groups()
                quotes = quotes or ""
                info("quotes = '%s'", quotes)
                # quotes holds only ">" and " ", so it is safe to embed in
                # the replacement template.
                broken = SEMANTIC_BREAK_RE.sub(r"\1\2\3\n" + quotes, body)
                wrapped_lines.append(quotes + broken)
            wrapped = "\n".join(wrapped_lines) + "\n"
            info("wrapped_line = '%s'", wrapped)
        else:
            wrapped = SEMANTIC_BREAK_RE.sub(r"\1\2\3\n", paragraph) + "\n"
        new_content.append(wrapped)
    return "\n".join(new_content)
def main(argv):
    """Parse the command line and configure logging.

    argv -- list of argument strings, without the program name

    The number of -V flags picks the log level: none leaves logging at 100,
    one gives ERROR, two INFO, three or more DEBUG.  With -L the log goes to
    the file "wrap.log"; otherwise logging's default handler is used.

    Returns the argparse namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser(
        description="a couple different types of line wrappers"
    )

    # positional arguments
    parser.add_argument("files", nargs=1, metavar="FILE")

    # optional arguments, declared as (flags, keyword options) pairs
    option_table = (
        (
            ("-s", "--semantic"),
            dict(
                action="store_true",
                default=False,
                help="semantic wrap (at sentence endings)",
            ),
        ),
        (
            ("-w", "--wrap"),
            dict(
                type=int,
                default=72,
                help="wrap width (default: %(default)s)",
            ),
        ),
        (
            ("-o", "--out-filename"),
            dict(help="output results to filename", metavar="FILE"),
        ),
        (
            ("-L", "--log-to-file"),
            dict(
                action="store_true",
                default=False,
                help="log to file %(prog)s.log",
            ),
        ),
        (
            ("-V", "--verbose"),
            dict(
                action="count",
                default=0,
                help="Increase verbosity (specify multiple times for more)",
            ),
        ),
    )
    for flags, options in option_table:
        parser.add_argument(*flags, **options)
    parser.add_argument("--version", action="version", version="TBD")

    parsed = parser.parse_args(argv)

    # Map the -V count onto a logging level; 100 is the default.
    if parsed.verbose >= 3:
        chosen_level = logging.DEBUG  # 10
    else:
        chosen_level = {1: logging.ERROR, 2: logging.INFO}.get(
            parsed.verbose, 100
        )

    logging_setup = dict(
        level=chosen_level,
        format="%(levelno)s %(funcName).5s: %(message)s",
    )
    if parsed.log_to_file:
        logging_setup.update(filename="wrap.log", filemode="w")
    logging.basicConfig(**logging_setup)

    return parsed
if "__main__" == __name__: | |
args = main(sys.argv[1:]) | |
content = open(args.files[0]).read() | |
if args.semantic: | |
print(semantic_wrap(content)) | |
else: | |
print(quoted_wrap(content, args.wrap)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment