Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions lexos/helpers/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@
MUFI_3_FILENAME = "MUFI_3_DICT.tsv"
MUFI_4_FILENAME = "MUFI_4_DICT.tsv"
STOPWORD_FILENAME = "stopwords.p"
# CONSOLIDATION_FILENAME = "consolidations.p"
PATTERN_REPLACEMENTS_FILENAME = "pattern_replacements.p"
LEMMA_FILENAME = "lemmas.p"
CONSOLIDATION_FILENAME = "consolidations.p"
SPECIAL_CHAR_FILENAME = "specialchars.p"
DIGIT_MAP_FILENAME = "digitmap.p"
PUNCTUATION_MAP_FILENAME = "punctuationmap.p"
Expand Down Expand Up @@ -84,14 +85,14 @@
SCRUBINPUTS = (
"stop_words",
"special_characters",
"consolidations",
"pattern_replacements",
"lemmas"
)

OPTUPLOADNAMES = (
"stop_words_file[]",
"lemmas_file[]",
"consolidations_file[]",
"pattern_replacements_file[]",
"special_characters_file[]"
)

Expand Down Expand Up @@ -209,13 +210,13 @@
"stop_words": "",
"stop_words_method": "Off",
"special_characters": "",
"consolidations": "",
"pattern_replacements": "",
"lemmas": "",
"special_characters_preset": "None",
"file_uploads": {
"stop_words_file[]": "",
"lemmas_file[]": "",
"consolidations_file[]": "",
"pattern_replacements[]": "",
"special_characters_file[]": ""}}

DEFAULT_CUT_OPTIONS = {
Expand Down
3 changes: 3 additions & 0 deletions lexos/helpers/error_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
"Too many values on right side of replacement string."
REPLACEMENT_NO_LEFT_HAND_MESSAGE = \
"Missing value on the left side of replacement string."
UNESCAPED_GREATER_THAN_SIGN_MESSAGE = "Please place a backslash before all" \
" examples of `>` in the pattern you wish to" \
" replace."
# ----------------------------------------------------------------------------


Expand Down
117 changes: 90 additions & 27 deletions lexos/processors/prepare/scrubber.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from lexos.helpers import constants as constants, \
general_functions as general_functions
from lexos.helpers.error_messages import NOT_ONE_REPLACEMENT_COLON_MESSAGE, \
REPLACEMENT_RIGHT_OPERAND_MESSAGE, REPLACEMENT_NO_LEFT_HAND_MESSAGE
REPLACEMENT_RIGHT_OPERAND_MESSAGE, REPLACEMENT_NO_LEFT_HAND_MESSAGE, \
UNESCAPED_GREATER_THAN_SIGN_MESSAGE
from lexos.helpers.exceptions import LexosException


Expand Down Expand Up @@ -192,6 +193,64 @@ def replacement_handler(text: str,
return text


def pattern_replacement_handler(text: str,
                                replacer_string: str) -> str:
    """Handle pattern replacement lines found in scrub-alteration-upload files.

    Each non-blank line of replacer_string has the form
    "pattern>substitution", where an unescaped ">" separates the two halves.
    A literal ">" may be written as "\\>", "\\s" in the substitution stands
    for a single space, and a (case-insensitive) "REGEX:" prefix marks the
    pattern as a regular expression. Replacements are applied to the text in
    the order the lines are listed.

    :param text: A unicode string with the whole text to be altered.
    :param replacer_string: A formatted string input with newline-separated
        "replacement lines", where each line is formatted to replace the
        majority of the words with one word.
    :returns: The input string with all replacements applied in order.
    :raises LexosException: If a line does not contain exactly one unescaped
        ">" separator.
    """

    # Convert HTML character entities to Unicode if HTML is selected *and*
    # there are further entities entered in the form field
    if request.form['special_characters_preset'] == 'HTML':
        text = html.unescape(text)

    # Remove literal spaces for a consistent format (tabs survive and are
    # handled around the separator below), then trim outer newlines
    no_space_replacer = replacer_string.translate({ord(" "): None}).strip('\n')

    # Handle excess blank lines in the file, etc.
    replacement_lines = [token for token in no_space_replacer.split('\n')
                         if token != ""]

    # The separator is any ">" NOT preceded by a backslash; "\>" is literal
    pat_for_sep = re.compile(r'(?<!\\)>')

    replacement_jobs = []
    for replacement_line in replacement_lines:
        # Exactly one unescaped ">" must separate pattern and substitution.
        # More than one means the user forgot to escape a ">"; zero means
        # the separator is missing entirely. Either way, fail with the
        # app's exception type rather than an unhandled ValueError later.
        if len(pat_for_sep.findall(replacement_line)) != 1:
            raise LexosException(
                UNESCAPED_GREATER_THAN_SIGN_MESSAGE + replacement_line)

        # Remove whitespace around the separator and then split on it
        replacement_line = re.sub(r'\s+>\s+', '>', replacement_line)
        pattern, substitution = pat_for_sep.split(replacement_line)

        # "\>" stands for a literal ">" on either side of the separator
        # (the help text documents escaping ">" in the pattern as well)
        pattern = pattern.replace('\\>', '>')
        substitution = substitution.replace('\\>', '>')
        # Convert the "\s" token in the substitution to a space
        substitution = substitution.replace('\\s', ' ')

        # If the pattern has the prefix REGEX: (any case), remove it and
        # treat the pattern as a regular expression
        if pattern.lower().startswith('regex:'):
            regex = True
            pattern = re.sub(r'^REGEX:', '', pattern, flags=re.IGNORECASE)
        else:
            regex = False
        replacement_jobs.append((regex, pattern, substitution))

    # Apply every collected replacement, in the order the lines were listed
    # (previously the jobs list was built but never consumed)
    for regex, pattern, substitution in replacement_jobs:
        if regex:
            text = re.sub(pattern, substitution, text)
        else:
            text = text.replace(pattern, substitution)
    return text


def replace_with_dict(text: str, replacement_dict: Dict[str, str],
edge1: str, edge2: str) -> str:
"""Alters text according to the replacements dictionary.
Expand Down Expand Up @@ -813,9 +872,9 @@ def prepare_additional_options(opt_uploads: Dict[str, FileStorage],
option text fields and files.
"""

file_strings = {'consolidations_file[]': '', 'lemmas_file[]': '',
file_strings = {'pattern_replacements_file[]': '', 'lemmas_file[]': '',
'special_characters_file[]': '', 'stop_words_file[]': '',
'consolidations': '', 'lemmas': '',
'pattern_replacements': '', 'lemmas': '',
'special_characters': '', 'stop_words': ''}

for index, key in enumerate(sorted(opt_uploads)):
Expand All @@ -831,14 +890,15 @@ def prepare_additional_options(opt_uploads: Dict[str, FileStorage],
file_strings[key] = ""

# Create an array of option strings:
# cons_file_string, lem_file_string, sc_file_string, sw_kw_file_string,
# cons_manual, lem_manual, sc_manual, and sw_kw_manual
# pat_replacements_file_string, lem_file_string, sc_file_string,
# sw_kw_file_string, pattern_replacements_manual, lem_manual, sc_manual,
# and sw_kw_manual

all_options = [file_strings.get('consolidations_file[]'),
all_options = [file_strings.get('pattern_replacements_file[]'),
file_strings.get('lemmas_file[]'),
file_strings.get('special_characters_file[]'),
file_strings.get('stop_words_file[]'),
request.form['consolidations'],
request.form['pattern_replacements'],
request.form['lemmas'],
request.form['special_characters'],
request.form['stop_words']]
Expand Down Expand Up @@ -884,33 +944,34 @@ def scrub(text: str, gutenberg: bool, lower: bool, punct: bool, apos: bool,

storage_filenames = sorted(
[constants.STOPWORD_FILENAME, constants.LEMMA_FILENAME,
constants.CONSOLIDATION_FILENAME, constants.SPECIAL_CHAR_FILENAME])
constants.PATTERN_REPLACEMENTS_FILENAME,
constants.SPECIAL_CHAR_FILENAME])
option_strings = prepare_additional_options(
opt_uploads, storage_options, storage_folder, storage_filenames)

# handle uploaded FILES: consolidations, lemmas, special characters,
# handle uploaded FILES: pattern_replacements, lemmas, special characters,
# stop-keep words
cons_file_string = option_strings[0]
pat_replacements_file_string = option_strings[0]
lem_file_string = option_strings[1]
sc_file_string = option_strings[2]
sw_kw_file_string = option_strings[3]

# handle manual entries: consolidations, lemmas, special characters,
# handle manual entries: pattern_replacements, lemmas, special characters,
# stop-keep words
cons_manual = option_strings[4]
pattern_replacements_manual = option_strings[4]
lem_manual = option_strings[5]
sc_manual = option_strings[6]
sw_kw_manual = option_strings[7]

# Scrubbing order:
#
# Note: lemmas and consolidations do NOT work on tags; in short,
# Note: lemmas and pattern_replacements do NOT work on tags; in short,
# these manipulations do not change inside any tags
#
# 0. Gutenberg
# 1. lower
# (not applied in tags ever;
# lemmas/consolidations/specialChars/stopKeepWords changed;
# lemmas/pattern_replacements/specialChars/stopKeepWords changed;
# text not changed at this point)
# 2. special characters
# 3. tags - scrub tags
Expand All @@ -920,7 +981,7 @@ def scrub(text: str, gutenberg: bool, lower: bool, punct: bool, apos: bool,
# 5. digits (text not changed at this point, not applied in tags ever)
# 6. white space (text not changed at this point, not applied in tags ever,
# otherwise tag attributes will be messed up)
# 7. consolidations
# 7. pattern_replacements
# (text not changed at this point, not applied in tags ever)
# 8. lemmatize (text not changed at this point, not applied in tags ever)
# 9. stop words/keep words
Expand All @@ -929,7 +990,7 @@ def scrub(text: str, gutenberg: bool, lower: bool, punct: bool, apos: bool,
# apply:
# 0. remove Gutenberg boiler plate (if any)
# 1. lowercase
# 2. consolidation
# 2. pattern_replacements
# 3. lemmatize
# 4. stop words
# 5. remove punctuation, digits, and whitespace without changing all the
Expand All @@ -956,13 +1017,13 @@ def to_lower_function(orig_text: str) -> str:

# since lower is ON, apply lowercase to other options
# apply to contents of any uploaded files
cons_file_string = cons_file_string.lower()
pat_replacements_file_string = pat_replacements_file_string.lower()
lem_file_string = lem_file_string.lower()
sc_file_string = sc_file_string.lower()
sw_kw_file_string = sw_kw_file_string.lower()

# apply to contents manually entered
cons_manual = cons_manual.lower()
pattern_replacements_manual = pattern_replacements_manual.lower()
lem_manual = lem_manual.lower()
sc_manual = sc_manual.lower()
sw_kw_manual = sw_kw_manual.lower()
Expand Down Expand Up @@ -1039,21 +1100,23 @@ def total_removal_function(orig_text: str) -> str:
"""
return orig_text.translate(total_removal_map)

# -- 7. consolidations ---------------------------------------------------
def consolidation_function(orig_text: str) -> str:
# -- 7. pattern_replacements --------------------------------------------
def pattern_replacements_function(orig_text: str) -> str:
"""Replaces characters according to user input strings.

:param orig_text: A text string.
:return: The text with characters swapped according to cons_file_string
and cons_manual.
:return: The text with characters swapped according to
pat_replacements_file_string and pattern_replacements_manual.
"""

replacer_string = handle_file_and_manual_strings(
file_string=cons_file_string, manual_string=cons_manual,
storage_folder=storage_folder, storage_filenames=storage_filenames,
file_string=pat_replacements_file_string,
manual_string=pattern_replacements_manual,
storage_folder=storage_folder,
storage_filenames=storage_filenames,
storage_number=0)
text = replacement_handler(
text=orig_text, replacer_string=replacer_string, is_lemma=False)
text = pattern_replacement_handler(
text=orig_text, replacer_string=replacer_string)
return text

# -- 8. lemmatize --------------------------------------------------------
Expand Down Expand Up @@ -1106,7 +1169,7 @@ def stop_keep_words_function(orig_text: str) -> str:

# apply all the functions and exclude tag
functions = [to_lower_function,
consolidation_function,
pattern_replacements_function,
lemmatize_function,
total_removal_function,
stop_keep_words_function]
Expand Down
90 changes: 39 additions & 51 deletions lexos/static/help/scrub-help.html
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ <h3 class="help-section-paragraph">
Converts all uppercase characters to lowercase characters so that the
tokens "The" and "the" will be considered as the same term. In addition,
all contents (whether in uploaded files or entered manually) for the
Stop Words/Keep Words, Lemmas, Consolidations, or Special Characters
Stop Words/Keep Words, Pattern Replacements, Lemmas, or Special Characters
options will also have all uppercase characters changed to lowercase.
Lowercase is not applied inside any <code>HTML</code>, <code>XML</code>,
or <code>SGML</code> markup tags remaining in the text.
Expand Down Expand Up @@ -212,6 +212,44 @@ <h3 class="help-section-paragraph">
<a target="_blank" href="https://github.com/WheatonCS/Lexos/tree/master/test/test_suite/Scrubbing/Stop%20Words%20%26%20Keep%20Words"><u>click here</u></a>
</h3>

<li><b>Pattern Replacements</b></li>
<h3 class="help-section-paragraph">
Replaces a list of character patterns with substitutions. This is
typically to consolidate symbols considered equivalent or to provide
highly specific cleaning functions.
</h3>
<h3 class="help-section-paragraph">
For example, in Old English, the character "eth" <em>ð</em> is interchangeable
with the character "thorn" <em>þ</em>. The Pattern Replacements option allows you to
choose to merge the two using a single character.
</h3>
<h3 class="help-section-paragraph">
        Pattern replacements should be entered in the format: <pre>ð > þ</pre> where you wish to
        change all occurrences of <code>ð</code> to <code>þ</code>. Multiple
        replacements should be listed on separate lines. All replacements will be
        applied in the order in which they are listed.
</h3>
<h3 class="help-section-paragraph">
If you find that you need to replace the <code>&gt;</code> character itself, place a backslash
before it like <code>\></code>. To replace a character with a space, use <code>\s</code>. For
instance, the pattern <code>- > \s</code> will replace hyphens with spaces.
</h3>
<h3 class="help-section-paragraph">
Lexos accepts regular expression (regex) patterns if the pattern begins with <code>REGEX:</code>.
For example, <code>REGEX:^c > k</code> will change <em>c</em> to <em>k</em> only at the beginning
of the text. If you are using regex capture groups, you can reference them with <code>\1</code>,
<code>\2</code>, etc. For example, the pattern <code>REGEX:(hi)-(ho) > \2-\1</code> will change
"hi-ho" to "ho-hi". A useful regular expressions tutorial can be found at
<a target="_blank" href="https://regexone.com/"><u>RegexOne</u></a>. Regex patterns can also be
tested at <a target="_blank" href="https://regex101.com/">Regex101</a>.
</h3>
<h3 class="help-section-paragraph">
Pattern replacements can be entered manually in the provided form field or
uploaded from a file. Note that the "Make Lowercase" option will be applied
to your list of characters if that option is also selected. To replace
entire words (terms) with other words, you should use the <b>Lemma</b> option.
</h3>

<li><b>Lemmas</b></li>
<h3 class="help-section-paragraph">
Replaces all instances of terms in a list with a common replacement
Expand All @@ -234,28 +272,6 @@ <h3 class="help-section-paragraph">
You can manually enter a list of lemmas or upload your own.
</h3>

<li><b>Consolidations</b></li>
<h3 class="help-section-paragraph">
Replaces a list of characters with a different character. This is
typically to consolidate symbols considered equivalent.
</h3>
<h3 class="help-section-paragraph">
For example, in Old English, the character "eth" ð is interchangeable
with the character "thorn" þ. The Consolidations option allows you to
choose to merge the two using a single character.
</h3>
<h3 class="help-section-paragraph">
Consolidations should be entered in the format: <pre>ð: þ</pre> Where you wish to
change all occurrences of <code>ð</code> to <code>þ</code> . Multiple
consolidations can be separated by commas or line breaks.
</h3>
<h3 class="help-section-paragraph">
Consolidations can be entered manually in the provided form field or
uploaded from a file. Note that the "Make Lowercase" option will be applied
to your list of characters if that option is also selected. To replace
entire words (terms) with other words, you should use the <b>Lemma</b> option.
</h3>

<li><b>Special Characters</b></li>
<h3 class="help-section-paragraph">
Replaces character entities with their glyph equivalents.
Expand Down Expand Up @@ -287,31 +303,3 @@ <h3 class="help-section-paragraph">
Multiple transformation rules should be listed on separate lines.
</h3>
</ul>

<!-- Replacing Patterns -->
<h3 class="help-section-title">Replacing Patterns</h3>

<h3 class="help-section-paragraph">
Sometimes it is necessary to replace a pattern rather than a precise string.
For instance, if a document contains multiple URLs like
<code>http://lexos.wheatoncollege.edu</code> and
<code>http://scalar.usc.edu/works/lexos/</code>, and you need to strip
these URLs, a method is required for matching all URLs without knowing
what they are in advance. One technique is to apply regular expression
(regex) pattern matching. Lexos uses regular expressions internally to
perform some of its scrubbing options, but, as of version 3.0, it does
not provide a way for users to supply their own regular expression
patterns when scrubbing. If users need to strip or replace patterns by
regular expressions, it will be necessary to perform that action using
a separate script or tool <em><u>prior to using Lexos</u></em>. A useful
regular expressions tutorial can be found at
<a target="_blank" href="https://regexone.com/"><u>RegexOne</u></a>.
Most modern text editors like
<a target="_blank" href="https://www.sublimetext.com/"><u>Sublime Text</u></a> and
<a target="_blank" href="http://www.barebones.com/products/TextWrangler/"><u>TextWrangler</u></a>
accept regular expressions in their search and replace functions, and
users may find them to be a convenient means of performing actions with
regular expressions. We hope to add a regular expression pattern matching
to Lexos in the future.
</h3>

Loading