Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions lexos/helpers/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@
MUFI_3_FILENAME = "MUFI_3_DICT.tsv"
MUFI_4_FILENAME = "MUFI_4_DICT.tsv"
STOPWORD_FILENAME = "stopwords.p"
# CONSOLIDATION_FILENAME = "consolidations.p"
PATTERN_REPLACEMENTS_FILENAME = "pattern_replacements.p"
LEMMA_FILENAME = "lemmas.p"
CONSOLIDATION_FILENAME = "consolidations.p"
SPECIAL_CHAR_FILENAME = "specialchars.p"
DIGIT_MAP_FILENAME = "digitmap.p"
PUNCTUATION_MAP_FILENAME = "punctuationmap.p"
Expand Down Expand Up @@ -84,14 +85,14 @@
SCRUBINPUTS = (
"stop_words",
"special_characters",
"consolidations",
"pattern_replacements",
"lemmas"
)

OPTUPLOADNAMES = (
"stop_words_file[]",
"lemmas_file[]",
"consolidations_file[]",
"pattern_replacements_file[]",
"special_characters_file[]"
)

Expand Down Expand Up @@ -209,13 +210,13 @@
"stop_words": "",
"stop_words_method": "Off",
"special_characters": "",
"consolidations": "",
"pattern_replacements": "",
"lemmas": "",
"special_characters_preset": "None",
"file_uploads": {
"stop_words_file[]": "",
"lemmas_file[]": "",
"consolidations_file[]": "",
"pattern_replacements[]": "",
"special_characters_file[]": ""}}

DEFAULT_CUT_OPTIONS = {
Expand Down
3 changes: 3 additions & 0 deletions lexos/helpers/error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
"Too many values on right side of replacement string."
REPLACEMENT_NO_LEFT_HAND_MESSAGE = \
"Missing value on the left side of replacement string."
UNESCAPED_GREATER_THAN_SIGN_MESSAGE = "Please place a backslash before all" \
" examples of `>` in the pattern you wish to" \
" replace."
# ----------------------------------------------------------------------------


Expand Down
117 changes: 90 additions & 27 deletions lexos/processors/prepare/scrubber.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from lexos.helpers import constants as constants, \
general_functions as general_functions
from lexos.helpers.error_messages import NOT_ONE_REPLACEMENT_COLON_MESSAGE, \
REPLACEMENT_RIGHT_OPERAND_MESSAGE, REPLACEMENT_NO_LEFT_HAND_MESSAGE
REPLACEMENT_RIGHT_OPERAND_MESSAGE, REPLACEMENT_NO_LEFT_HAND_MESSAGE, \
UNESCAPED_GREATER_THAN_SIGN_MESSAGE
from lexos.helpers.exceptions import LexosException


Expand Down Expand Up @@ -192,6 +193,64 @@ def replacement_handler(text: str,
return text


def pattern_replacement_handler(text: str,
                                replacer_string: str) -> str:
    """Handle pattern replacement lines found in scrub-alteration-upload files.

    Each non-blank line of replacer_string has the form
    "pattern>substitution", where an unescaped ">" separates the two halves.
    A literal ">" may be written as "\\>", "\\s" in the substitution stands
    for a single space, and a (case-insensitive) "REGEX:" prefix marks the
    pattern as a regular expression. Replacements are applied to the text in
    the order the lines are listed.

    :param text: A unicode string with the whole text to be altered.
    :param replacer_string: A formatted string input with newline-separated
        "replacement lines", where each line is formatted to replace the
        majority of the words with one word.
    :returns: The input string with all replacements applied in order.
    :raises LexosException: If a line does not contain exactly one unescaped
        ">" separator.
    """

    # Convert HTML character entities to Unicode if HTML is selected *and*
    # there are further entities entered in the form field
    if request.form['special_characters_preset'] == 'HTML':
        text = html.unescape(text)

    # Remove literal spaces for a consistent format (tabs survive and are
    # handled around the separator below), then trim outer newlines
    no_space_replacer = replacer_string.translate({ord(" "): None}).strip('\n')

    # Handle excess blank lines in the file, etc.
    replacement_lines = [token for token in no_space_replacer.split('\n')
                         if token != ""]

    # The separator is any ">" NOT preceded by a backslash; "\>" is literal
    pat_for_sep = re.compile(r'(?<!\\)>')

    replacement_jobs = []
    for replacement_line in replacement_lines:
        # Exactly one unescaped ">" must separate pattern and substitution.
        # More than one means the user forgot to escape a ">"; zero means
        # the separator is missing entirely. Either way, fail with the
        # app's exception type rather than an unhandled ValueError later.
        if len(pat_for_sep.findall(replacement_line)) != 1:
            raise LexosException(
                UNESCAPED_GREATER_THAN_SIGN_MESSAGE + replacement_line)

        # Remove whitespace around the separator and then split on it
        replacement_line = re.sub(r'\s+>\s+', '>', replacement_line)
        pattern, substitution = pat_for_sep.split(replacement_line)

        # "\>" stands for a literal ">" on either side of the separator
        # (the help text documents escaping ">" in the pattern as well)
        pattern = pattern.replace('\\>', '>')
        substitution = substitution.replace('\\>', '>')
        # Convert the "\s" token in the substitution to a space
        substitution = substitution.replace('\\s', ' ')

        # If the pattern has the prefix REGEX: (any case), remove it and
        # treat the pattern as a regular expression
        if pattern.lower().startswith('regex:'):
            regex = True
            pattern = re.sub(r'^REGEX:', '', pattern, flags=re.IGNORECASE)
        else:
            regex = False
        replacement_jobs.append((regex, pattern, substitution))

    # Apply every collected replacement, in the order the lines were listed
    # (previously the jobs list was built but never consumed)
    for regex, pattern, substitution in replacement_jobs:
        if regex:
            text = re.sub(pattern, substitution, text)
        else:
            text = text.replace(pattern, substitution)
    return text


def replace_with_dict(text: str, replacement_dict: Dict[str, str],
edge1: str, edge2: str) -> str:
"""Alters text according to the replacements dictionary.
Expand Down Expand Up @@ -813,9 +872,9 @@ def prepare_additional_options(opt_uploads: Dict[str, FileStorage],
option text fields and files.
"""

file_strings = {'consolidations_file[]': '', 'lemmas_file[]': '',
file_strings = {'pattern_replacements_file[]': '', 'lemmas_file[]': '',
'special_characters_file[]': '', 'stop_words_file[]': '',
'consolidations': '', 'lemmas': '',
'pattern_replacements': '', 'lemmas': '',
'special_characters': '', 'stop_words': ''}

for index, key in enumerate(sorted(opt_uploads)):
Expand All @@ -831,14 +890,15 @@ def prepare_additional_options(opt_uploads: Dict[str, FileStorage],
file_strings[key] = ""

# Create an array of option strings:
# cons_file_string, lem_file_string, sc_file_string, sw_kw_file_string,
# cons_manual, lem_manual, sc_manual, and sw_kw_manual
# pat_replacements_file_string, lem_file_string, sc_file_string,
# sw_kw_file_string, pattern_replacements_manual, lem_manual, sc_manual,
# and sw_kw_manual

all_options = [file_strings.get('consolidations_file[]'),
all_options = [file_strings.get('pattern_replacements_file[]'),
file_strings.get('lemmas_file[]'),
file_strings.get('special_characters_file[]'),
file_strings.get('stop_words_file[]'),
request.form['consolidations'],
request.form['pattern_replacements'],
request.form['lemmas'],
request.form['special_characters'],
request.form['stop_words']]
Expand Down Expand Up @@ -884,33 +944,34 @@ def scrub(text: str, gutenberg: bool, lower: bool, punct: bool, apos: bool,

storage_filenames = sorted(
[constants.STOPWORD_FILENAME, constants.LEMMA_FILENAME,
constants.CONSOLIDATION_FILENAME, constants.SPECIAL_CHAR_FILENAME])
constants.PATTERN_REPLACEMENTS_FILENAME,
constants.SPECIAL_CHAR_FILENAME])
option_strings = prepare_additional_options(
opt_uploads, storage_options, storage_folder, storage_filenames)

# handle uploaded FILES: consolidations, lemmas, special characters,
# handle uploaded FILES: pattern_replacements, lemmas, special characters,
# stop-keep words
cons_file_string = option_strings[0]
pat_replacements_file_string = option_strings[0]
lem_file_string = option_strings[1]
sc_file_string = option_strings[2]
sw_kw_file_string = option_strings[3]

# handle manual entries: consolidations, lemmas, special characters,
# handle manual entries: pattern_replacements, lemmas, special characters,
# stop-keep words
cons_manual = option_strings[4]
pattern_replacements_manual = option_strings[4]
lem_manual = option_strings[5]
sc_manual = option_strings[6]
sw_kw_manual = option_strings[7]

# Scrubbing order:
#
# Note: lemmas and consolidations do NOT work on tags; in short,
# Note: lemmas and pattern_replacements do NOT work on tags; in short,
# these manipulations do not change inside any tags
#
# 0. Gutenberg
# 1. lower
# (not applied in tags ever;
# lemmas/consolidations/specialChars/stopKeepWords changed;
# lemmas/pattern_replacements/specialChars/stopKeepWords changed;
# text not changed at this point)
# 2. special characters
# 3. tags - scrub tags
Expand All @@ -920,7 +981,7 @@ def scrub(text: str, gutenberg: bool, lower: bool, punct: bool, apos: bool,
# 5. digits (text not changed at this point, not applied in tags ever)
# 6. white space (text not changed at this point, not applied in tags ever,
# otherwise tag attributes will be messed up)
# 7. consolidations
# 7. pattern_replacements
# (text not changed at this point, not applied in tags ever)
# 8. lemmatize (text not changed at this point, not applied in tags ever)
# 9. stop words/keep words
Expand All @@ -929,7 +990,7 @@ def scrub(text: str, gutenberg: bool, lower: bool, punct: bool, apos: bool,
# apply:
# 0. remove Gutenberg boiler plate (if any)
# 1. lowercase
# 2. consolidation
# 2. pattern_replacements
# 3. lemmatize
# 4. stop words
# 5. remove punctuation, digits, and whitespace without changing all the
Expand All @@ -956,13 +1017,13 @@ def to_lower_function(orig_text: str) -> str:

# since lower is ON, apply lowercase to other options
# apply to contents of any uploaded files
cons_file_string = cons_file_string.lower()
pat_replacements_file_string = pat_replacements_file_string.lower()
lem_file_string = lem_file_string.lower()
sc_file_string = sc_file_string.lower()
sw_kw_file_string = sw_kw_file_string.lower()

# apply to contents manually entered
cons_manual = cons_manual.lower()
pattern_replacements_manual = pattern_replacements_manual.lower()
lem_manual = lem_manual.lower()
sc_manual = sc_manual.lower()
sw_kw_manual = sw_kw_manual.lower()
Expand Down Expand Up @@ -1039,21 +1100,23 @@ def total_removal_function(orig_text: str) -> str:
"""
return orig_text.translate(total_removal_map)

# -- 7. consolidations ---------------------------------------------------
def consolidation_function(orig_text: str) -> str:
# -- 7. pattern_replacements --------------------------------------------
def pattern_replacements_function(orig_text: str) -> str:
"""Replaces characters according to user input strings.

:param orig_text: A text string.
:return: The text with characters swapped according to cons_file_string
and cons_manual.
:return: The text with characters swapped according to
pat_replacements_file_string and pattern_replacements_manual.
"""

replacer_string = handle_file_and_manual_strings(
file_string=cons_file_string, manual_string=cons_manual,
storage_folder=storage_folder, storage_filenames=storage_filenames,
file_string=pat_replacements_file_string,
manual_string=pattern_replacements_manual,
storage_folder=storage_folder,
storage_filenames=storage_filenames,
storage_number=0)
text = replacement_handler(
text=orig_text, replacer_string=replacer_string, is_lemma=False)
text = pattern_replacement_handler(
text=orig_text, replacer_string=replacer_string)
return text

# -- 8. lemmatize --------------------------------------------------------
Expand Down Expand Up @@ -1106,7 +1169,7 @@ def stop_keep_words_function(orig_text: str) -> str:

# apply all the functions and exclude tag
functions = [to_lower_function,
consolidation_function,
pattern_replacements_function,
lemmatize_function,
total_removal_function,
stop_keep_words_function]
Expand Down
90 changes: 39 additions & 51 deletions lexos/static/help/scrub-help.html
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ <h3 class="help-section-paragraph">
Converts all uppercase characters to lowercase characters so that the
tokens "The" and "the" will be considered as the same term. In addition,
all contents (whether in uploaded files or entered manually) for the
Stop Words/Keep Words, Lemmas, Consolidations, or Special Characters
Stop Words/Keep Words, Pattern Replacements, Lemmas, or Special Characters
options will also have all uppercase characters changed to lowercase.
Lowercase is not applied inside any <code>HTML</code>, <code>XML</code>,
or <code>SGML</code> markup tags remaining in the text.
Expand Down Expand Up @@ -212,6 +212,44 @@ <h3 class="help-section-paragraph">
<a target="_blank" href="https://github.com/WheatonCS/Lexos/tree/master/test/test_suite/Scrubbing/Stop%20Words%20%26%20Keep%20Words"><u>click here</u></a>
</h3>

<li><b>Pattern Replacements</b></li>
<h3 class="help-section-paragraph">
Replaces a list of character patterns with substitutions. This is
typically to consolidate symbols considered equivalent or to provide
highly specific cleaning functions.
</h3>
<h3 class="help-section-paragraph">
For example, in Old English, the character "eth" <em>ð</em> is interchangeable
with the character "thorn" <em>þ</em>. The Pattern Replacements option allows you to
choose to merge the two using a single character.
</h3>
<h3 class="help-section-paragraph">
        Pattern replacements should be entered in the format: <pre>ð > þ</pre> where you wish to
        change all occurrences of <code>ð</code> to <code>þ</code>. Multiple
        replacements should be listed on separate lines. All replacements will be
        applied in the order in which they are listed.
</h3>
<h3 class="help-section-paragraph">
If you find that you need to replace the <code>&gt;</code> character itself, place a backslash
before it like <code>\></code>. To replace a character with a space, use <code>\s</code>. For
instance, the pattern <code>- > \s</code> will replace hyphens with spaces.
</h3>
<h3 class="help-section-paragraph">
Lexos accepts regular expression (regex) patterns if the pattern begins with <code>REGEX:</code>.
For example, <code>REGEX:^c > k</code> will change <em>c</em> to <em>k</em> only at the beginning
of the text. If you are using regex capture groups, you can reference them with <code>\1</code>,
<code>\2</code>, etc. For example, the pattern <code>REGEX:(hi)-(ho) > \2-\1</code> will change
"hi-ho" to "ho-hi". A useful regular expressions tutorial can be found at
<a target="_blank" href="https://regexone.com/"><u>RegexOne</u></a>. Regex patterns can also be
tested at <a target="_blank" href="https://regex101.com/">Regex101</a>.
</h3>
<h3 class="help-section-paragraph">
Pattern replacements can be entered manually in the provided form field or
uploaded from a file. Note that the "Make Lowercase" option will be applied
to your list of characters if that option is also selected. To replace
entire words (terms) with other words, you should use the <b>Lemma</b> option.
</h3>

<li><b>Lemmas</b></li>
<h3 class="help-section-paragraph">
Replaces all instances of terms in a list with a common replacement
Expand All @@ -234,28 +272,6 @@ <h3 class="help-section-paragraph">
You can manually enter a list of lemmas or upload your own.
</h3>

<li><b>Consolidations</b></li>
<h3 class="help-section-paragraph">
Replaces a list of characters with a different character. This is
typically to consolidate symbols considered equivalent.
</h3>
<h3 class="help-section-paragraph">
For example, in Old English, the character "eth" ð is interchangeable
with the character "thorn" þ. The Consolidations option allows you to
choose to merge the two using a single character.
</h3>
<h3 class="help-section-paragraph">
Consolidations should be entered in the format: <pre>ð: þ</pre> Where you wish to
change all occurrences of <code>ð</code> to <code>þ</code> . Multiple
consolidations can be separated by commas or line breaks.
</h3>
<h3 class="help-section-paragraph">
Consolidations can be entered manually in the provided form field or
uploaded from a file. Note that the "Make Lowercase" option will be applied
to your list of characters if that option is also selected. To replace
entire words (terms) with other words, you should use the <b>Lemma</b> option.
</h3>

<li><b>Special Characters</b></li>
<h3 class="help-section-paragraph">
Replaces character entities with their glyph equivalents.
Expand Down Expand Up @@ -287,31 +303,3 @@ <h3 class="help-section-paragraph">
Multiple transformation rules should be listed on separate lines.
</h3>
</ul>

<!-- Replacing Patterns -->
<h3 class="help-section-title">Replacing Patterns</h3>

<h3 class="help-section-paragraph">
Sometimes it is necessary to replace a pattern rather than a precise string.
For instance, if a document contains multiple URLs like
<code>http://lexos.wheatoncollege.edu</code> and
<code>http://scalar.usc.edu/works/lexos/</code>, and you need to strip
these URLs, a method is required for matching all URLs without knowing
what they are in advance. One technique is to apply regular expression
(regex) pattern matching. Lexos uses regular expressions internally to
perform some of its scrubbing options, but, as of version 3.0, it does
not provide a way for users to supply their own regular expression
patterns when scrubbing. If users need to strip or replace patterns by
regular expressions, it will be necessary to perform that action using
a separate script or tool <em><u>prior to using Lexos</u></em>. A useful
regular expressions tutorial can be found at
<a target="_blank" href="https://regexone.com/"><u>RegexOne</u></a>.
Most modern text editors like
<a target="_blank" href="https://www.sublimetext.com/"><u>Sublime Text</u></a> and
<a target="_blank" href="http://www.barebones.com/products/TextWrangler/"><u>TextWrangler</u></a>
accept regular expressions in their search and replace functions, and
users may find them to be a convenient means of performing actions with
regular expressions. We hope to add a regular expression pattern matching
to Lexos in the future.
</h3>

Loading