"""Metrics for measuring various aspects of words.
Conventions used in this utility:
1. All functions return a dictionary,
with key 'data' and/or 'summary':
return {
'data': data,
'summary': summary or None
}
"""
from __future__ import division
import re
# from nltk import pos_tag
from pattern.en import parse
def prep_file(file_name):
    """Read a file and return its non-empty lines as a list of items.

    Line endings are stripped from every item so that they do not leak
    into downstream metrics (a trailing newline would otherwise be counted
    as part of the word's length and as a special character).
    Lines that are empty once the line ending is removed are skipped.

    Args:
        file_name (str): The file name to open

    Returns:
        items (list): A list of items extracted from the file, one per
            non-empty line, in file order.
    """
    items = []
    with open(file_name) as handle:
        for line in handle:
            # Only the line terminator is removed; inner and leading
            # whitespace is part of the item and is kept as-is.
            item = line.rstrip('\r\n')
            if item:
                items.append(item)
    return items
def get_named_numbers_1_10(words):
    """Return the words containing a spelled-out number from one to ten.

    A word matches when it contains one of the number names as a whole
    word (case-insensitive), e.g. 'Three Sixty' or 'ten pin'. Number names
    that only appear inside a longer word (e.g. the 'ten' in 'kitten')
    are not counted.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of matching
            words in input order; 'summary' reports how many matched.
    """
    # Word boundaries apply to every alternative; the previous pattern
    # anchored only the first alternative and required a trailing space
    # for all but the last one.
    numbers = re.compile(
        r'\b(?:one|two|three|four|five|six|seven|eight|nine|ten)\b',
        re.IGNORECASE)
    matches = [word for word in words if numbers.search(word)]
    return {
        'data': matches,
        'summary': 'Of {} words, {} matched'.format(len(words), len(matches))
    }
def name_length(words):
    """Measure the length of each word and report the rounded average.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of lengths
            in input order; 'summary' states the word count and the
            average length rounded with the built-in round(). An empty
            input yields an average of 0 instead of raising
            ZeroDivisionError.
    """
    names_length = [len(val) for val in words]
    # Guard the division so an empty word list produces a usable summary.
    average = round(
        sum(names_length) / len(names_length) if names_length else 0)
    summary = 'Of {} words, the average length of names is...{}'.format(
        len(words), average)
    return {
        'data': names_length,
        'summary': summary
    }
def name_vowel_count(words):
    """Total the occurrences of each lowercase vowel across all words.

    Counting is case-sensitive: only 'a', 'e', 'i', 'o' and 'u' are
    counted, uppercase vowels are not.

    Items that cannot be counted (for example None or numbers) are skipped
    individually, so one bad item no longer stops the remaining words from
    being counted. A words value of None is treated as an empty list.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' maps each vowel to its
            total count; 'summary' is always None.
    """
    vowels = 'aeiou'
    num_count = dict.fromkeys(vowels, 0)
    if words is None:
        words = ()
    for word in words:
        try:
            # Count all vowels for this item before touching the totals,
            # so a failing item never contributes a partial count.
            per_word = [(vowel, word.count(vowel)) for vowel in vowels]
        except (AttributeError, TypeError):
            # Best effort: ignore items that do not support str-style
            # counting rather than aborting the whole run.
            continue
        for vowel, occurrences in per_word:
            num_count[vowel] += occurrences
    return {
        'data': num_count,
        'summary': None
    }
def name_starts_with_vowel(words):
    """Report how many of the given words start with a lowercase vowel.

    The check is case-sensitive: only words whose first character is one
    of 'a', 'e', 'i', 'o', 'u' are counted.
    NOTE(review): capitalised names such as 'Apple' are therefore not
    counted - confirm whether that is intended.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is always None;
            'summary' states the word count, the number of matches and
            the rounded percentage. An empty input reports 0% instead of
            raising ZeroDivisionError.
    """
    vowels = re.compile(r'\A[aeiou]')
    vowelcount = sum(1 for name in words if vowels.match(name))
    # Guard the division so an empty word list produces a usable summary.
    ratio = float(vowelcount) / len(words) if words else 0.0
    summary = 'Of {} words, {} or {}% are vowels as the first letter.'.format(
        len(words), vowelcount, round(ratio * 100))
    return {
        'data': None,
        'summary': summary
    }
def get_digits_frequency(words):
    """Collect every run of digits found in the names, e.g. 7-11, 3M.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the flat list of
            digit runs (as strings) in the order found; 'summary' states
            how many words contained digits and how many runs were found.
    """
    digit_run = re.compile(r'[0-9]+')
    found = []
    words_with_digits = 0
    for name in words:
        hits = digit_run.findall(name)
        if not hits:
            continue
        words_with_digits += 1
        found.extend(hits)
    summary = ('Of {} words, {} have numbers in them, '
               'with a total of {} numbers found.').format(
                   len(words), words_with_digits, len(found))
    return {
        'data': found,
        'summary': summary
    }
def get_first_letter_frequency(words):
    """Count how often each first character occurs, e.g. [C]at, [C]law: C = 2.

    The first character is used exactly as it appears, so 'C' and 'c' are
    counted separately. Empty words have no first character and are
    skipped instead of raising IndexError.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' maps each first
            character to the number of words starting with it; 'summary'
            is always None.
    """
    letters = {}
    for name in words:
        if not name:
            # Nothing to index into; ignore blank entries.
            continue
        first = name[0]
        letters[first] = letters.get(first, 0) + 1
    return {
        'data': letters,
        'summary': None
    }
def get_special_chars(words):
    """Collect every character that is not an ASCII letter (a-z, any case).

    Digits, whitespace and punctuation all count as special characters.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the flat list of
            special characters in the order found; 'summary' states how
            many were found across how many words.
    """
    non_letter = re.compile(r'[^a-z]', re.IGNORECASE)
    specials = []
    for word in words:
        specials.extend(non_letter.findall(word))
    summary = ('{} occurrences of special characters were'
               ' found in {} words.').format(len(specials), len(words))
    return {
        'data': specials,
        'summary': summary
    }
def get_word_types(words):
    """Run each word through the part-of-speech parser.

    Every word is passed to ``parse`` with tagging enabled and tokenizing,
    chunking, relations and lemmata disabled. Words for which ``parse``
    raises IndexError are skipped.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the list of values
            returned by ``parse`` for the words that could be parsed, in
            input order; 'summary' is always None.
    """
    parse_options = {
        'encoding': 'utf-8',
        'tokenize': False,
        'light': False,
        'tags': True,
        'chunks': False,
        'relations': False,
        'lemmata': False,
    }
    tagged = []
    for word in words:
        try:
            tagged.append(parse(word, **parse_options))
        except IndexError:
            # Skip words the parser cannot handle.
            continue
    return {
        'data': tagged,
        'summary': None
    }
def get_name_spaces(words):
    """Report the space-separated part count for each word.

    The value stored under 'spaces' is the number of pieces obtained by
    splitting the word on a single space, i.e. the number of spaces plus
    one (a word without any space reports 1).

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is a list of
            ``{'word': ..., 'spaces': ...}`` dicts in input order;
            'summary' is always None.
    """
    results = []
    for word in words:
        parts = word.split(' ')
        results.append({'word': word, 'spaces': len(parts)})
    return {
        'data': results,
        'summary': None
    }
def get_consonant_repeat_frequency(words):
    """Count words matched by the consonant-repeat pattern.

    As written, the pattern is a single negated character class, so a
    word is counted when its first character is none of
    ``a e i o u | { 6 }``. No repetition is actually checked.
    NOTE(review): the ``|`` and ``{6}`` inside the brackets are literal
    characters, which looks unintended - confirm the desired pattern.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the number of
            matching words; 'summary' is always None.
    """
    pattern = re.compile(r'[^a|e|i|o|u{6}]')
    matched = sum(1 for word in words if pattern.match(word))
    return {
        'data': matched,
        'summary': None
    }
def get_consonant_duplicate_repeat_frequency(words):
    """Count words matched by the duplicate-consonant pattern.

    As written, the pattern matches one or more characters that are none
    of ``a e i o u |`` at the start of the word, so a word is counted
    whenever its first character is outside that set.
    NOTE(review): the ``|`` inside the brackets is a literal character
    and ``{1,}`` does not require a repeat - confirm the desired pattern.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the number of
            matching words; 'summary' is always None.
    """
    pattern = re.compile(r'[^a|e|i|o|u]{1,}')
    matched = sum(1 for name in words if pattern.match(name))
    return {
        'data': matched,
        'summary': None
    }
def get_vowel_repeat_frequency(words):
    """Count words matched by the vowel-repeat pattern.

    As written, the pattern is a single character class, so a word is
    counted when its first character is one of ``a e i o u { 3 }``
    (lowercase only). No repetition is actually checked.
    NOTE(review): the ``{3}`` inside the brackets is taken literally,
    which looks unintended - confirm the desired pattern.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' is the number of
            matching words; 'summary' is always None.
    """
    pattern = re.compile(r'[aeiou{3}]')
    matched = sum(1 for word in words if pattern.match(word))
    return {
        'data': matched,
        'summary': None
    }
def get_adjective_verb_or_noun(words):
    """Tally how many words are tagged as nouns or as verbs.

    Tags 'NN' and 'NNP' count as nouns; tags 'VBP', 'VB', 'RB' and 'VBG'
    count as verbs. Every other tag falls into the "everything else"
    remainder reported in the summary.

    NOTE(review): this relies on a module-level ``pos_tag``; the only
    import of it visible in this file is commented out - confirm it is
    provided, otherwise the call raises NameError.

    Args:
        words (list): A list of words

    Returns:
        dict: The data and summary results. 'data' holds the 'verbs' and
            'nouns' counts; 'summary' also reports the remainder.
    """
    word_total = len(words)
    noun_tags = ('NN', 'NNP')
    verb_tags = ('VBP', 'VB', 'RB', 'VBG')
    tally = {'verbs': 0, 'nouns': 0}
    for _, tag in pos_tag(words):
        # Noun tags take precedence; the two tag sets do not overlap.
        if tag in noun_tags:
            tally['nouns'] += 1
        elif tag in verb_tags:
            tally['verbs'] += 1
    others = word_total - (tally['verbs'] + tally['nouns'])
    summary = ('Of {0} words, {1} were nouns, {2} were verbs, '
               'and {3} were everything else.').format(
                   word_total, tally['nouns'], tally['verbs'], others)
    return {
        'data': tally,
        'summary': summary
    }
def categorize_word_type(words):
    """Weight each word against the common naming-strategy categories.

    Categories are derived from
    http://www.thenameinspector.com/10-name-types/,
    so it is important to note there is no agreed upon standard,
    meaning it is ultimately a little arbitrary.

    Since it is hard to determine a word's type with certainty, each word
    receives a weighting per category based on a few known metrics.
    This can be updated in the future so that weightings are binary
    (e.g. 0.0 and 100.0), giving traditional False/True.

    Categories ====
    1. Real Words
        1a. Misspelled words
        1b. Foreign words
    2. Compounds
    3. Phrases
    4. Blends
    5. Tweaked
    6. Affixed
    7. Fake/obscure
    8. Puns
    9. People's names
    10. Initials and Acronyms

    Only 'real', 'phrase' and 'fake_obscure' are currently weighted; the
    remaining categories always stay at 0.

    NOTE(review): this relies on a module-level ``pos_tag``; the only
    import of it visible in this file is commented out - confirm it is
    provided, otherwise a non-empty input raises NameError.

    Args:
        words (list): A list of words

    Returns:
        new_words (list) - A list of lists, with each word
            and its distribution by word "type"
    """
    category_names = (
        'real',
        'misspelled',
        'foreign',
        'compound',
        'phrase',
        'blend',
        'tweaked',
        'affixed',
        'fake_obscure',
        'puns',
        'person',
        'initials_acronym',
    )

    def _get_distribution(word):
        """Return the distribution for all categories, given a single word."""
        # TODO:
        # misspelled, foreign, tweaked, affixed, fake_obscure,
        # initials_acronym, blend, puns, person, compound
        weights = dict.fromkeys(category_names, 0)
        # A single token is treated as a real word, anything with a
        # space in it as a phrase.
        is_single = len(word.split(' ')) == 1
        weights['real' if is_single else 'phrase'] = 50
        # If word cannot be tagged,
        # it's very likely fake_obscure
        if pos_tag([word])[0][1] == '-NONE-':
            weights['real'] = 0
            weights['fake_obscure'] = 75
        return weights

    return [[word, _get_distribution(word)] for word in words]
def generate_all_metrics(filename=None, words=None):
    """Run every metric in this module over one set of names.

    The names come from ``filename`` when it is given (loaded through
    ``prep_file``), otherwise from ``words``.

    Args:
        filename (str, optional): A filename to load words from.
        words (list, optional): Words to use, if file is not specified.

    Returns:
        dict: The names under 'names' and all metrics results, keyed by
            name, under 'metrics'. None when neither a filename nor any
            words are supplied.
    """
    if not (filename or words):
        return None
    allnames = prep_file(filename) if filename else words
    # 'length' and 'name_length' both report name_length(); both keys are
    # kept so existing consumers of either one keep working.
    metrics = {
        'digits_freq': get_digits_frequency(allnames),
        'length': name_length(allnames),
        'vowel_beginning': name_starts_with_vowel(allnames),
        'vowel_count': name_vowel_count(allnames),
        'name_length': name_length(allnames),
        'name_spaces': get_name_spaces(allnames),
        'consonant_repeat_freq': get_consonant_repeat_frequency(allnames),
        'consonant_dup_repeat_freq':
            get_consonant_duplicate_repeat_frequency(allnames),
        'vowel_repeat_freq': get_vowel_repeat_frequency(allnames),
        'special_characters': get_special_chars(allnames),
        'name_numbers': get_named_numbers_1_10(allnames),
        'adj_verb_noun': get_adjective_verb_or_noun(allnames),
        'first_letter_freq': get_first_letter_frequency(allnames),
        'word_types': get_word_types(allnames)
    }
    return {
        'names': allnames,
        'metrics': metrics
    }