Source code for assembl.nlp.wordcounter

"""Count number of occurences of stemmed words for the creativity widget."""
import re
from collections import defaultdict
from future.utils import as_native_str

from . import (
    locale_to_lang, get_stop_words, known_languages, get_stemmer)

class StemSet(set):
    """Set of surface forms sharing one stem, with an accumulated weight."""

    def __init__(self):
        super(StemSet, self).__init__()
        # Sum of the weights passed to add(), repeated words included.
        self.counter = 0.0

    def add(self, word, weight=1.0):
        """Record one occurrence of ``word``.

        The weight is accumulated on every call, even when ``word`` is
        already a member of the set.

        :param word: surface form to store.
        :param weight: amount added to ``counter`` for this occurrence.
        """
        super(StemSet, self).add(word)
        self.counter += weight

    def shortest(self):
        """Return the shortest word in the set.

        Words of equal length are ordered by their natural sort order, so
        the result does not depend on set iteration order.

        :raises IndexError: if the set is empty.
        """
        if not self:
            raise IndexError("shortest() called on an empty StemSet")
        return min(self, key=lambda word: (len(word), word))

    @as_native_str()
    def __repr__(self):
        # Swap the closing parenthesis of the set repr for the counter.
        return super(StemSet, self).__repr__()[:-1] + ", %f)" % (
            self.counter)
class WordCounter(defaultdict):
    """Weighted tally of words, grouped by their stem.

    Maps each stem to a ``StemSet`` that holds the surface forms seen for
    that stem together with their accumulated weight.
    """

    # Raw string: ``\W`` is a regex escape, not a Python string escape.
    non_words = re.compile(r'\W+', re.U)

    def __init__(self, langs, min_len=3):
        """Collect stop words and pick a stemmer for the given locales.

        :param langs: iterable of locale codes; locales whose language is
            not in ``known_languages`` are ignored.
        :param min_len: words shorter than this are not counted.
        """
        super(WordCounter, self).__init__(StemSet)
        self.min_len = min_len
        self.langs = []
        # We will base stemmer on first known language.
        stemmer = None
        stopwords = set()
        for locale in langs:
            lang = locale_to_lang(locale)
            if lang not in known_languages:
                continue
            stopwords.update(get_stop_words(lang))
            self.langs.append(lang)
            stemmer = stemmer or get_stemmer(lang, False)
        # No usable language-specific stemmer: fall back to get_stemmer(None).
        self.stemmer = stemmer or get_stemmer(None)
        self.stop_words = stopwords

    def add_text(self, text, weight=1.0):
        """Split ``text`` on non-word characters and count every word.

        :param text: text to tokenize.
        :param weight: weight given to each word found in ``text``.
        """
        for word in self.non_words.split(text):
            # re.split yields '' when the text starts or ends with a
            # separator; those are not words.
            if word:
                self.add_word(word, weight)

    def add_word(self, word, weight=1.0):
        """Count one word, unless it is too short or a stop word.

        The word is lowercased for the stop-word check and for stemming,
        but stored under its stem in its original form.

        :param word: the word to count.
        :param weight: weight added to the stem's counter.
        """
        if len(word) < self.min_len:
            return
        lowered = word.lower()
        if lowered in self.stop_words:
            return
        self[self.stemmer.stemWord(lowered)].add(word, weight)

    def best(self, num=10):
        """Return the shortest form of the ``num`` heaviest stems.

        :param num: maximum number of words to return.
        :returns: list of words, heaviest stem first.
        """
        ranked = sorted(
            self.values(), key=lambda stems: stems.counter, reverse=True)
        return [stems.shortest() for stems in ranked[:num]]