"""Provides various scoring methods for word strength."""
import re
import fuzzy
# Shared phonetic encoders, instantiated once when the module is imported.
# NOTE(review): `dmeta` is not referenced by any function visible in this
# part of the module -- presumably used by score_dmetaphone; confirm.
dmeta = fuzzy.DMetaphone()
# `soundex` is called by score_soundex. The argument 4 is presumably the
# length of the generated code -- confirm against the fuzzy documentation.
soundex = fuzzy.Soundex(4)
def score_soundex(words):
    """Pair each word with its soundex encoding.

    Args:
        words (list) - the list of words.
    Returns:
        scores (list) - one '<lowercased word>: <soundex code>' string
            per input word, in input order.
    """
    scored = []
    for entry in words:
        # The label is lowercased; the encoder receives the word as given.
        scored.append('{}: {}'.format(entry.lower(), soundex(entry)))
    return scored
def score_nysiis(words):
    """Pair each word with its nysiis encoding.

    Args:
        words (list) - the list of words.
    Returns:
        scores (list) - one '<lowercased word>: <nysiis code>' string
            per input word, in input order.
    """
    scored = []
    for entry in words:
        # The label is lowercased; the encoder receives the word as given.
        scored.append('{}: {}'.format(entry.lower(), fuzzy.nysiis(entry)))
    return scored
def score_length(word):
    """Return a score, 0-5, for the length of the word.

    Really long, or really short words get a lower score. There is no
    hard science behind the bands; they are a rule of thumb.

    Score bands, by number of characters:
        empty / None -> 0
        1-4          -> 3
        5-9          -> 5
        10-15        -> 4
        16-20        -> 2
        21 or more   -> 1

    Args:
        word (str): The word to score.
    Returns:
        score (int): The resulting score.
    """
    # Covers both None and the empty string.
    if not word:
        return 0
    size = len(word)
    # Checked from longest to shortest so each test only needs a lower bound.
    if size > 20:
        return 1
    if size > 15:
        return 2
    if size >= 10:
        return 4
    if size > 4:
        return 5
    return 3
def bounded(num, start, end):
    """Determine if a number is within the bounds of `start` and `end`.

    Both bounds are inclusive.

    Args:
        num (int): The number to test.
        start (int): The inclusive minimum.
        end (int): The inclusive maximum.
    Returns:
        is_bounded (bool): Whether number is bounded by start and end.
    """
    return start <= num <= end
def score_pronounceability(word):
    """Score a word by its balance of vowels to consonants.

    This is a very basic measurement. Everything except ASCII letters and
    digits is discarded, then the smaller of the vowel and non-vowel counts
    is divided by the larger, giving a ratio between 0.0 and 1.0. Vowels
    are matched case-insensitively; digits count as non-vowels.

    An equal number of vowels and consonants gives a ratio of 1.0, which
    is treated as perfect.

    The 1-5 scale translation:
        ratio  0.0  0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9  1.0
        score   0    1    2    3    4    5    4    3    2    1    5

    Args:
        word (string) - the name
    Returns:
        score (int) - the final pronounceability score; 0 when the word is
            empty, or has no vowels, or has no consonants.
    """
    if not word:
        return 0
    cleaned = re.sub(r'[^a-zA-Z0-9]', '', word).lower()
    vowels = sum(1 for ch in cleaned if ch in 'aeiou')
    consonants = len(cleaned) - vowels
    # Value comparison (not identity) so the zero case is actually caught;
    # this also guards the division below when nothing is left to count.
    if vowels == 0 or consonants == 0:
        return 0
    ratio = float(min(vowels, consonants)) / max(vowels, consonants)
    if ratio == 1.0:
        return 5
    # NOTE(review): a ratio just below 1.0 scores 1 while exactly 1.0
    # scores 5. This follows the documented table; confirm it is intended.
    # Bands are inclusive and tested in this order, so a ratio sitting on a
    # shared edge takes the first band that matches.
    bands = (
        (0.0, 0.1, 1), (0.9, 1.0, 1),
        (0.1, 0.2, 2), (0.8, 0.9, 2),
        (0.2, 0.3, 3), (0.7, 0.8, 3),
        (0.3, 0.4, 4), (0.6, 0.7, 4),
        (0.4, 0.5, 5), (0.5, 0.6, 5),
    )
    for low, high, score in bands:
        if low <= ratio <= high:
            return score
    return 0
def score_simplicity(word):
    """Determine how simple the name is.

    Simplicity is based on the number of separate words in the name: the
    fewer the words, the higher (better) the score. A word is a run of
    ASCII letters of either case; anything else acts as a separator, and
    repeated, leading or trailing separators do not add words.

    Word count to score:
        0 -> 0, 1 -> 5, 2 -> 4, 3 -> 3, 4 -> 2, 5 or more -> 1

    Args:
        word (string) - the name
    Returns:
        score (int) - the final simplicity score

    >>> score_simplicity('the cat in the hat')
    1
    >>> score_simplicity('facebook')
    5
    """
    if not word:
        return 0
    word_count = len(re.findall(r'[a-zA-Z]+', word))
    # Nothing alphabetic to judge; treated the same as an empty name.
    if word_count == 0:
        return 0
    # After 4+ words, the name has a very poor score.
    return {1: 5, 2: 4, 3: 3, 4: 2}.get(word_count, 1)
def score_name_overall(word):
    """Combine the individual scores for a name into one overall grade.

    The length, pronounceability and simplicity scores are added
    together, multiplied by 10, rounded, and capped at 100.
    The closer to 100, the better.

    Args:
        word (string) - the name
    Returns:
        score - the final name score, never greater than 100
    """
    parts = (
        score_length(word),
        score_pronounceability(word),
        score_simplicity(word),
    )
    scaled = round(sum(parts) * 10)
    # cut off at 100%
    return min(scaled, 100)
def score_names_overall(words):
    """Grade every name in a list.

    Args:
        words (list) - the list of words.
    Returns:
        words (list) - a list of (score, word) tuples, in input order.
    """
    graded = []
    for name in words:
        graded.append((score_name_overall(name), name))
    return graded
def generate_all_scoring(words):
    """Run every scoring method over a set of words.

    Args:
        words (list) - the list of words.
    Returns:
        words (dict) - the scores, keyed by scoring name
            ('dmetaphone', 'soundex', 'nysiis', 'grade').
    """
    scoring = {}
    # NOTE(review): score_dmetaphone is not defined in the visible part of
    # this module -- confirm it exists elsewhere in the file.
    scoring['dmetaphone'] = score_dmetaphone(words)
    scoring['soundex'] = score_soundex(words)
    scoring['nysiis'] = score_nysiis(words)
    scoring['grade'] = score_names_overall(words)
    return scoring