# Source code for namebot.normalization

"""Helpers to normalize inputs and text."""

import re
import string
from collections import defaultdict

# from nltk.corpus import stopwords

from pattern.vector import PORTER
from pattern.vector import stem

import settings as namebot_settings


def flatten(lst):
    """Flatten a list with arbitrary levels of nesting.

    CREDIT: http://stackoverflow.com/questions/10823877/
        what-is-the-fastest-way-to-flatten-arbitrarily-nested-lists-in-python
    Changes made include:
    1. Adding error handling,
    2. Renaming variables,
    3. Using `isinstance` with a tuple instead of `any`.
    See http://creativecommons.org/licenses/by-sa/3.0/ for specific details.

    Args:
        lst (list): The nested list.

    Returns:
        (generator): The new flattened list of words.
    """
    # Accept tuples here as well: the recursion below passes tuples in.
    # Without the `return`, non-iterable input raised TypeError and
    # tuples yielded a spurious [] before being iterated.
    if not isinstance(lst, (list, tuple)):
        yield []
        return
    for item in lst:
        if isinstance(item, (list, tuple)):
            for sub_item in flatten(item):
                yield sub_item
        else:
            yield item
def remove_odd_sounding_words(words):
    """Remove random odd sounding word combinations via regular expressions.

    Args:
        words (list): The list of words

    Returns:
        list: An updated word list with words cleaned.
    """
    if words is None or len(words) == 0:
        return words
    odd_regexes = [
        # A run of 3-6 consecutive vowels at the start of a word.
        # (The original `^a|e|i|o|u|y{3,6}` alternation removed EVERY
        # word starting with a vowel due to regex precedence.)
        re.compile(r'^[aeiouy]{3,6}'),
        # bk, ck, dk, gk, etc... plus a few awkward clusters.
        re.compile(r'\b[^aeiouys]k|zt|ksd|kd|zhr'),
        re.compile(r'\bzt|ksd|kd|zhr'),
    ]
    # Keep only words matched by none of the regexes.
    return [word for word in words
            if not any(regex.match(word) for regex in odd_regexes)]
def stem_words(words):
    """Stem words to their base linguistic stem to remove redundancy.

    Args:
        words (list): The list of words

    Returns:
        list: An updated word list with words stemmed.
    """
    # Porter-stem each word via pattern.vector's stemmer.
    return [stem(word, stemmer=PORTER) for word in words]
def remove_stop_words(words):
    """Remove all English stop words from a word list.

    Args:
        words (list): The list of words

    Returns:
        list: An updated word list with stopwords removed.
    """
    # The module-level `from nltk.corpus import stopwords` is commented
    # out, so the original reference raised NameError at call time.
    # Import lazily here so nltk stays optional until actually needed.
    from nltk.corpus import stopwords
    # http://stackoverflow.com/questions/5486337/
    # how-to-remove-stop-words-using-nltk-or-python
    # Use a set for O(1) membership tests.
    stop_words = set(stopwords.words('english'))
    return [w for w in words if w.lower() not in stop_words]
def remove_bad_words(words):
    """Remove naughty words that might come from wordnet synsets and lemmata.

    Args:
        words (list): The list of words

    Returns:
        list: An updated word list with bad words removed.
    """
    bad_words = ["nigger", "wop", "kike", "faggot", "fuck", "pussy", "cunt"]
    cleaned = []
    for word in words:
        # Compare case-insensitively but keep the original casing.
        if word.lower() not in bad_words:
            cleaned.append(word)
    return cleaned
def filter_words(words):
    """Filter words by default min/max settings in the settings module.

    Single words must fall within MIN_LENGTH..MAX_LENGTH.  Multi-word
    values are split on spaces, each chunk is kept only when it falls
    within MIN_LENGTH..SPACED_MAX_LENGTH, and the survivors rejoined.

    Args:
        words (list): The list of words

    Returns:
        list: The filtered words
    """
    filtered = []
    for word in words:
        if ' ' not in word:
            if (namebot_settings.MIN_LENGTH <= len(word)
                    <= namebot_settings.MAX_LENGTH):
                filtered.append(word)
        else:
            kept_chunks = [
                chunk for chunk in word.split(' ')
                if (namebot_settings.MIN_LENGTH <= len(chunk)
                    <= namebot_settings.SPACED_MAX_LENGTH)]
            filtered.append(' '.join(kept_chunks))
    return filtered
def uniquify(words):
    """Remove duplicates from a list.

    Args:
        words (list): The list of words

    Returns:
        list: An updated word list with duplicates removed, or the input
            unchanged if it was None.
    """
    if words is None:
        return words
    # `{}.fromkeys(words).keys()` returns a dict_keys view on Python 3,
    # not the documented list; dict.fromkeys also preserves first-seen
    # order of the words.
    return list(dict.fromkeys(words))
def clean_sort(words):
    """A function for cleaning and prepping words for techniques.

    Strips whitespace, lowercases and deletes punctuation/digits from
    every word longer than one character.

    Args:
        words (list): The list of words

    Returns:
        list: An updated word list with words cleaned, or the input
            unchanged if it was a bare string or None.
    """
    # `basestring` and `string.maketrans` were removed in Python 3;
    # use `str` and `str.maketrans` instead.
    if isinstance(words, str):
        return words
    chars = '!"#$%\'()*+,._/:;<=>?@[\\]^`{|}~01234567890'
    if words is not None:
        # Map every unwanted character to None (i.e. delete it).
        delete_table = str.maketrans('', '', chars)
        try:
            words = [word.strip().lower().translate(delete_table)
                     for word in words if len(word) > 1]
        except TypeError:
            # Preserve the original defensive behavior: return the input
            # untouched when an item is not a string.
            pass
    return words
def chop_duplicate_ends(word):
    """Remove duplicate letters on either end, if they are adjacent.

    Args:
        word (str): The word to trim.

    Returns:
        str: The word with a duplicated leading and/or trailing letter
            removed.
    """
    # Guard: `word[1]` raised IndexError on empty or 1-char input.
    if len(word) < 2:
        return word
    if word[0] == word[1]:
        word = word[1:]
    if word[-2:-1] == word[-1:]:
        word = word[:-1]
    return word
def key_words_by_pos_tag(words):
    """Key words by the pos tag name, given when using pos_tag on a list.

    Args:
        words (list): The list of words, where each item is a 2-tuple.

    Returns:
        dict: A dictionary keyed by pos tag, whose values are lists of
            the words carrying that tag.
    """
    grouped = defaultdict(list)
    for token, tag in words:
        grouped[tag].append(token)
    return grouped