Python nltk.corpus.stopwords module: words() example source code
The following 50 code examples, extracted from open-source Python projects, illustrate how to use nltk.corpus.stopwords.words().
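Before the project examples, a minimal self-contained sketch of the call itself (it assumes the stopwords corpus data has already been fetched with nltk.download):
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')  # one-time download of the corpus data
english_stops = set(stopwords.words('english'))
print(len(english_stops))            # size of the English stopword list
print('the' in english_stops)        # True
print(stopwords.words('dutch')[:5])  # other languages are available as well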
def SpeechToText():
r = sr.Recognizer() #Speech recognition
with sr.Microphone() as source:
print("Say something!")
audio = r.listen(source)
message = r.recognize_google(audio)
print("Check: "+message)
try:
print("User: " + r.recognize_google(audio))
except sr.UnknownValueError:
print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
print("Could not request results from Google Speech Recognition service; {0}".format(e))
return message
# Function to find the importance of words, so they can be used to deduce which thing is being asked about the most
def preprocessing(text):
text = text.decode("utf8")
# tokenize into words
tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
# remove stopwords
stop = stopwords.words('english')
tokens = [token for token in tokens if token not in stop]
# remove words less than three letters
tokens = [word for word in tokens if len(word) >= 3]
# lower capitalization
tokens = [word.lower() for word in tokens]
# lemmatize
lmtzr = WordNetLemmatizer()
tokens = [lmtzr.lemmatize(word) for word in tokens]
preprocessed_text= ' '.join(tokens)
return preprocessed_text
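A hedged usage sketch for preprocessing(): it assumes the punkt, stopwords and wordnet NLTK data packages are installed and that the module already imports nltk, stopwords and WordNetLemmatizer as the function expects. Because the function calls text.decode("utf8"), the caller passes a byte string:
raw = b"The cats were sitting quietly on the mats."
print(preprocessing(raw))
# prints a lemmatized, stopword-filtered string such as "the cat sitting quietly mat"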
def collection_stats():
# list of documents
documents_stat = reuters.fileids()
print(str(len(documents_stat)) + " documents")
train_docs_stat = list(filter(lambda doc: doc.startswith("train"), documents_stat))
print(str(len(train_docs_stat)) + " total training documents")
test_docs_stat = list(filter(lambda doc: doc.startswith("test"), documents_stat))
print(str(len(test_docs_stat)) + " total test documents")
# list of categories
categories = reuters.categories()
print(str(len(categories)) + " categories")
# get the documents in a category
category_docs = reuters.fileids("acq")
# words for a document
document_id = category_docs[0]
document_words = reuters.words(category_docs[0])
print(document_words)
# print the raw document
print(reuters.raw(document_id))
def collocations(self, num=20, window_size=2):
"""
Print collocations derived from the text, ignoring stopwords.
:seealso: find_collocations
:param num: The maximum number of collocations to print.
:type num: int
:param window_size: The number of tokens spanned by a collocation (default=2)
:type window_size: int
"""
if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
self._num = num
self._window_size = window_size
#print("Building collocations list")
from nltk.corpus import stopwords
ignored_words = stopwords.words('english')
finder = BigramCollocationFinder.from_words(self.tokens, window_size)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
bigram_measures = BigramAssocMeasures()
self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
print(tokenwrap(colloc_strings, separator="; "))
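The method above is essentially nltk.text.Text.collocations(); a small sketch of calling it through the public class (assumes the genesis corpus and the stopwords data are downloaded):
import nltk
from nltk.corpus import genesis

text = nltk.Text(genesis.words('english-kjv.txt'))
text.collocations(num=10, window_size=2)  # prints the ten strongest collocations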
def get_user_to_word_proportion(user_to_text, word):
"""
Maps each user to the proportion of his words that consist of a specified
word.
"""
user_to_word_proportion = {}
for user in user_to_text:
lm = LanguageModel(user_to_text[user])
n_tokens = len(lm.lowercase_tokens)
if n_tokens > 0:
fd = nltk.FreqDist(lm.lowercase_tokens)
user_to_word_proportion[user] = fd[word] / float(n_tokens)
else:
user_to_word_proportion[user] = 0.0
print 'Finished user {}'.format(user.encode('utf-8'))
return user_to_word_proportion
def generate(cfd, start_word, n):
word = start_word
words = []
for i in range(n):
words.append(word)
# word = cfd[word].max()
fd = cfd[word]
n_next_words = sum(fd.values())
if n_next_words > 0:
probabilities = [fd[w]/float(n_next_words) for w in sorted(fd.keys())]
word = choice(sorted(fd.keys()), p=probabilities)
else:
# Pick random word
old_word = word
# Todo: use unigram probabilities later
word = choice(cfd.keys())
words.append(word)
sentence = ' '.join(words)
# Todo: modify above for punctuation
return sentence
def rm_stop_words(data, mode="nltk", silent=1):
"""
Input:
data is a set, {} or Counter
"""
if silent==0:
print("remove stop words ...")
if mode == "nltk":
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
else:
print("unkNown mode",mode)
assert 0
if isinstance(data,list):
data = [i for i in data if i.lower() not in stop_words]
return data
else:
for word in stop_words:
if word in data:
del data[word]
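A short usage sketch for rm_stop_words(): with a list it returns a filtered copy, while with a Counter (or dict) it deletes stopword keys in place and returns None:
from collections import Counter

tokens = ["This", "is", "a", "simple", "example"]
print(rm_stop_words(tokens, mode="nltk", silent=0))  # e.g. ['simple', 'example']

counts = Counter({"the": 10, "model": 3, "and": 7})
rm_stop_words(counts, mode="nltk")  # stopword keys removed in place
print(counts)                       # Counter({'model': 3})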
def words_to_char_sequence(words_list, tk):
"""Convert words list to chars sequence
# Arguments
words_list: word list, shape (sentence_len, word_len)
# Output shape
(sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
"""
c_seqs = np.zeros((len(words_list),
TrainConfig.MAX_SEQUENCE_LENGTH,
TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
for w_i in xrange(len(words_list)):
words = words_list[w_i]
fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
ws = tk.texts_to_sequences(words)
ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
if TrainConfig.MAX_SEQUENCE_LENGTH < len(words):
max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH
else:
max_word_len = len(words)
fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
c_seqs[w_i] = fixed_ws
return c_seqs
def tiny_tokenize(text, stem=False, stop_words=[]):
words = []
for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', \
text.decode(encoding='UTF-8', errors='ignore'))):
if not token.isdigit() and not token in stop_words:
if stem:
try:
w = EnglishStemmer().stem(token)
except Exception as e:
w = token
else:
w = token
words.append(w)
return words
# return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
#     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
#     not token.isdigit() and not token in stop_words]
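A usage sketch, assuming the module imports re, string, wordpunct_tokenize from nltk.tokenize, EnglishStemmer from nltk.stem.snowball, and the NLTK stopword list; the function expects a byte string because it calls text.decode():
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))
print(tiny_tokenize(b"Running 3 quick tests, quickly!", stem=True, stop_words=stops))
# e.g. ['run', 'quick', 'test', 'quick'] -- digits and stopwords are dropped, the rest stemmed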
def build_vocab(word_freq, threshold=5, topn=None, start_idx=0):
"""
threshold only takes effect when topn is None.
words are indexed by overall frequency in the dataset.
"""
word_freq = sorted(word_freq.iteritems(), key=lambda d:d[1], reverse=True)
if topn:
word_freq = zip(*word_freq[:topn])[0]
vocab_dict = dict(zip(word_freq, range(start_idx, len(word_freq) + start_idx)))
else:
idx = start_idx
vocab_dict = {}
for word, freq in word_freq:
if freq < threshold:
return vocab_dict
vocab_dict[word] = idx
idx += 1
return vocab_dict
def bigrams(words, join_string, skip=0):
"""
Input: a list of words, e.g., ["I", "am", "Denny"]
Output: a list of bigrams, e.g., ["I_am", "am_Denny"]
"""
assert type(words) == list
L = len(words)
if L > 1:
lst = []
for i in range(L - 1):
for k in range(1, skip + 2):
if i + k < L:
lst.append(join_string.join([words[i], words[i + k]]))
else:
# set it as unigram
lst = NgramUtil.unigrams(words)
return lst
def trigrams(words, join_string, skip=0):
"""
Input: a list of words, e.g., ["I", "am", "Denny"]
Output: a list of trigrams, e.g., ["I_am_Denny"]
"""
assert type(words) == list
L = len(words)
if L > 2:
lst = []
for i in range(L - 2):
for k1 in range(1, skip + 2):
for k2 in range(1, skip + 2):
if i + k1 < L and i + k1 + k2 < L:
lst.append(join_string.join([words[i], words[i + k1], words[i + k1 + k2]]))
else:
# set it as bigram
lst = NgramUtil.bigrams(words, skip)
return lst
def biterms(words, join_string):
"""
Input: a list of words,"Denny","boy"]
Output: a list of biterm,"I_Denny","I_boy","am_Denny","am_boy","Denny_boy"]
"""
assert type(words) == list
L = len(words)
if L > 1:
lst = []
for i in range(L - 1):
for j in range(i + 1, L):
lst.append(join_string.join([words[i], words[j]]))
else:
# set it as uniterm
lst = NgramUtil.uniterms(words)
return lst
def triterms(words, join_string):
"""
Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
Output: a list of triterms, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"]
"""
assert type(words) == list
L = len(words)
if L > 2:
lst = []
for i in xrange(L - 2):
for j in xrange(i + 1, L - 1):
for k in xrange(j + 1, L):
lst.append(join_string.join([words[i], words[j], words[k]]))
else:
# set it as biterm
lst = NgramUtil.biterms(words, join_string)
return lst
def fourterms(words, join_string):
"""
Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"]
Output: a list of fourterms, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"]
"""
assert type(words) == list
L = len(words)
if L > 3:
lst = []
for i in xrange(L - 3):
for j in xrange(i + 1, L - 2):
for k in xrange(j + 1, L - 1):
for l in xrange(k + 1, L):
lst.append(join_string.join([words[i], words[j], words[k], words[l]]))
else:
# set it as triterm
lst = NgramUtil.triterms(words, join_string)
return lst
def ngrams(words, ngram, join_string=" "):
"""
wrapper for ngram
"""
if ngram == 1:
return NgramUtil.unigrams(words)
elif ngram == 2:
return NgramUtil.bigrams(words, join_string)
elif ngram == 3:
return NgramUtil.trigrams(words, join_string)
elif ngram == 4:
return NgramUtil.fourgrams(words, join_string)
elif ngram == 12:
unigram = NgramUtil.unigrams(words)
bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
return unigram + bigram
elif ngram == 123:
unigram = NgramUtil.unigrams(words)
bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3]
return unigram + bigram + trigram
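The helpers above call one another through an NgramUtil namespace, so presumably they are staticmethods of an NgramUtil class; under that assumption (and assuming unigrams() simply returns the word list), a usage sketch:
words = ["I", "am", "Denny"]
print(NgramUtil.unigrams(words))          # ['I', 'am', 'Denny'] (assumed behaviour)
print(NgramUtil.bigrams(words, "_"))      # ['I_am', 'am_Denny']
print(NgramUtil.trigrams(words, "_"))     # ['I_am_Denny']
print(NgramUtil.ngrams(words, 123, "_"))  # unigrams + bigrams + trigrams combined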
def build_vocabulary( words, max_size ):
vocab_instances = 0
unique_counts = Counter(words)
d = dict(unique_counts.most_common(cfg.vocabulary_size-2) )
vocabulary = OrderedDict( sorted(d.items(), key=lambda t: t[1], reverse=True) )
# start at 2 to leave room for padding & unknown
pb = Progress_bar(len(d) - 1)
for i, (key, value) in enumerate(vocabulary.items(), start=2):
vocab_instances += value
vocabulary[key] = i
pb.tick()
vocabulary[cfg.padding_char] = 0
vocabulary[cfg.placeholder_char] = 1
# reverse the vocabulary (for reverse lookup)
rev_vocabulary = {v: k for k, v in vocabulary.items()}
vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)
return vocab
def tokenize_text( sample_text ):
global sequence_lengths
processed_text = []
if cfg.remove_punctuation:
cleaned = sample_text.lower().translate( t_table )
else:
cleaned = sample_text
if cfg.use_casual_tokenizer:
tokens = tknzr.tokenize( cleaned )
else:
tokens = nltk.word_tokenize( cleaned, language='english')
if cfg.remove_stopwords:
tokens = [w for w in tokens if not w in stopwords.words('english')]
sequence_lengths.append( len( tokens ) )
processed_text.extend( tokens )
return processed_text
def __init__(self, min_cut=0.1, max_cut=0.9):
# indentation changes - we are inside the constructor
# here we set up the behaviour
# this is called each time an object of the frequency summarizer class is
# created or instantiated
self._min_cut = min_cut # 'self' refers to the current instance
self._max_cut = max_cut
# we save the val of the 2 parameters passed by assigning them
# two member variables - the 'self.' prefix identifies them as part
# of the self argument - using underscore as first char.
self._stopwords = set(stopwords.words('english') + list(punctuation))
# this is a list of all common words and punctuation symbols
# indentation changes - we are out of the constructor here
# This is still the body of the class
# Defining var here ( outside a member function) but within the class
# member var becomes STATIC. This means it belongs to the class, and not
# to any specific individual instance (object) of the class
def extractFeatures(self, article, n, customStopWords=None):
# pass in article as a tuple ( text,title)
text = article[0]
# extract the text
title = article[1]
# extract the title
sentences = sent_tokenize(text)
# split text into sentences
word_sent = [word_tokenize(s.lower()) for s in sentences]
# split sentences into words
self._freq = self._compute_frequencies(word_sent, customStopWords)
# calculate word freq using member func created above
if n < 0:
# how many features (words) to return - a -ve number means
# no feature (word) selection, just return all features
return nlargest(len(self._freq.keys()),
self._freq, key=self._freq.get)
else:
# here we say if the calling func has asked for a subset
# then return only the 'n' largest features, i.e. the
# most important words (important == frequent, excluding stopwords)
return nlargest(n, self._freq, key=self._freq.get)
def similarity(c1, c2):
'''Stop words are words like "it" and "the", which have little impact on the
meaning of the sentence.'''
stop_words = list(stopwords.words("english"))
# Removes stop words in both sentences
c1_cleaned = [x for x in word_tokenize(c1) if x not in stop_words]
c2_cleaned = [x for x in word_tokenize(c2) if x not in stop_words]
c1_words = Counter(dedupe(c1_cleaned))
c2_words = Counter(dedupe(c2_cleaned))
total_words = c1_words + c2_words
similarity_between_words = 0
for key, val in total_words.items():
''' Looks at whether the two articles share a word'''
if total_words[key] > 1:
similarity_between_words += 1
return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
def _answer_stop_word_density(self, row):
"""Percentage of tokens in the answer are stopwords
- Args:
row(pandas.dataframe): input row vector
- Returns:
row(pandas.dataframe): output vector with new feature
"""
stop = stopwords.words('english')
answer = row.Answer
if answer:
tokens = answer.split()
num_tokens = len(tokens)
stop_word_in_answer = [i for i in tokens if i in stop]
num_stop_word_in_answer = len(stop_word_in_answer)
row['ANSWER_STOPWORD_DENSITY'] = float(
num_stop_word_in_answer) / num_tokens
return row
else:
row['ANSWER_STOPWORD_DENSITY'] = 0
return row
def _answer_quantifier_density(self, row):
"""Percentage of tokens in the answer that are quantifier words
- Args:
row(pandas.dataframe): input pandas dataframe
- Returns:
row(pandas.dataframe): result a pandas dataframe with new feature
"""
answer = row.Answer
if answer:
tokens = answer.split()
answer_len = len(tokens)
quantifier_tokens = [
i for i in tokens if i in ling.QUANTIFIER_WORDS]
quantifier_tokens_len = len(quantifier_tokens)
row['ANSWER_QUANTIFIER_DENSITY'] = float(
quantifier_tokens_len) / answer_len
return row
else:
row['ANSWER_QUANTIFIER_DENSITY'] = 0
return row
def _percentage_capitalized_word_in_answer(self, row):
"""Percentage of capitalized words in the sentence that are in the answer
- Args:
row(pandas.dataframe): input pandas dataframe
- Returns:
row(pandas.dataframe): result a pandas dataframe with new feature
"""
answer = row.Answer
sentence = row.Sentence
if answer is not None and sentence is not None:
tokens = sentence.split()
num_tokens = len(tokens)
cap_tokens = [i for i in tokens if i.isupper() == True]
cap_tokens_in_answer = [i for i in cap_tokens if i in answer]
row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = float(
len(cap_tokens_in_answer)) / num_tokens
return row
else:
row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = 0
return row
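These extractors are meant to be applied row-wise to a pandas DataFrame that carries 'Answer' and 'Sentence' columns; a hypothetical sketch, where extractor stands for an instance of the (not shown) feature class. The quantifier feature is skipped here because it depends on the project-specific ling.QUANTIFIER_WORDS list:
import pandas as pd

df = pd.DataFrame({
    "Answer": ["the Eiffel Tower in Paris"],
    "Sentence": ["He visited the Eiffel Tower in Paris last May."],
})
df = df.apply(extractor._answer_stop_word_density, axis=1)
df = df.apply(extractor._percentage_capitalized_word_in_answer, axis=1)
print(df[["ANSWER_STOPWORD_DENSITY", "PERCENT_CAPITALIZED_WORDS_IN_ANSWER"]])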
def get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt):
"""
Get overlap, idf-weighted overlap, overlap excluding stopwords, and idf-weighted overlap excluding stopwords.
"""
stoplist = set(stopwords.words('english'))
num_docs = len(sent_list_1)
overlap_feats = []
for s1, s2 in zip(sent_list_1, sent_list_2):
tokens_a_set, tokens_b_set = set(s1), set(s2)
intersect = tokens_a_set & tokens_b_set
overlap = len(intersect) / (len(tokens_a_set) + len(tokens_b_set))
idf_intersect = sum(np.math.log(num_docs / word_to_doc_cnt[w]) for w in intersect)
idf_weighted_overlap = idf_intersect / (len(tokens_a_set) + len(tokens_b_set))
tokens_a_set_no_stop = set(w for w in s1 if w not in stoplist)
tokens_b_set_no_stop = set(w for w in s2 if w not in stoplist)
intersect_no_stop = tokens_a_set_no_stop & tokens_b_set_no_stop
overlap_no_stop = len(intersect_no_stop) / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
idf_intersect_no_stop = sum(np.math.log(num_docs / word_to_doc_cnt[w]) for w in intersect_no_stop)
idf_weighted_overlap_no_stop = idf_intersect_no_stop / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
overlap_feats.append([overlap, idf_weighted_overlap, overlap_no_stop, idf_weighted_overlap_no_stop])
return overlap_feats
def get_similar_documents_for_query(model_id, text):
"""
Return documents similar to the query, or an empty list if an error occurs or the query has no words after preprocessing
:param model_id:
:param text:
:return:
"""
model = db_utils.get_model(model_id)
topics_assignment = assign_topics_for_query(model_id, text)
if len(topics_assignment) != 0:
topics_vector = transform_topics_assignment_from_lda_to_vector(model['number_of_topics'], topics_assignment[0])
# print(topics_vector)
return get_similar_documents_by_vector(model_id, topics_vector)
else:
return []
def get_binary(self):
return Pipeline([
('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
('feat_select', SelectPercentile(percentile=10)),
('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False
)))
])
def get_sgdc(self):
return Pipeline([
('tfidf', SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False))
])
def wash(fileList):
# denyPos = ['CC','CD','DT','TO','']
st = LancasterStemmer()
for f in tqdm(fileList):
fr = open('./washFile/' + f, 'r')
fw = open("./washFile_stem/" + f, 'w')
for line in fr.read().splitlines():
line = remove_punctuation(line).lower()
# wordpos = pos(remove_punctuation(line).lower())
# for turple in wordpos:
# if (turple[0] not in stopwords.words('english')):
# fw.write(turple[0] + ' ')
# fw.write(x + ' ' for x in line.split() if x not in stopwords.words('english'))
# stopw = stopwords.words('english')
words = [x for x in line.split()]
for x in words:
try:
fw.write(st.stem(x) + ' ')
except:
print x
fr.close()
fw.close()
def count_entries(file_list):
"""Performs a count of the number of number of words in the corpus
Args:
file_list (list): list of file names.
Returns:
list: A list of json objects containing the count per file name
"""
result = []
for obj in file_list:
with open(CSV_PATH + obj + '.csv', "r") as entry:
reader = csv.reader(entry, delimiter=",")
col_count = len(reader.next())
res = {"Filename": obj, "Count": col_count}
result.append(res)
return result
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for label, words in labelled_words:
for word in words:
word_fd[word] += 1
label_word_fd[label][word] += 1
n_xx = label_word_fd.N()
high_info_words = set()
for label in label_word_fd.conditions():
n_xi = label_word_fd[label].N()
word_scores = collections.defaultdict(int)
for word, n_ii in label_word_fd[label].items():
n_ix = word_fd[word]
score = score_fn(n_ii, (n_ix, n_xi), n_xx)
word_scores[word] = score
bestwords = [word for word, score in word_scores.items() if score >= min_score]
high_info_words |= set(bestwords)
return high_info_words
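A usage sketch with the NLTK movie_reviews corpus (assumed downloaded), and assuming the module-level imports the function needs: collections, FreqDist and ConditionalFreqDist from nltk.probability, and BigramAssocMeasures from nltk.metrics:
from nltk.corpus import movie_reviews

labelled_words = [(label, movie_reviews.words(categories=[label]))
                  for label in movie_reviews.categories()]
high_info = high_information_words(labelled_words, min_score=5)
print(len(high_info))          # number of words that discriminate well between labels
print(sorted(high_info)[:10])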
def build_dictionary(sentences, vocabulary_size):
# Turn sentences (list of strings) into lists of words
split_sentences = [s.split() for s in sentences]
words = [x for sublist in split_sentences for x in sublist]
# Initialize list of [word, word_count] for each word, starting with unknown
count = [['RARE', -1]]
# Now add most frequent words, limited to the N-most frequent (N = vocabulary size)
count.extend(collections.Counter(words).most_common(vocabulary_size-1))
# Now create the dictionary
word_dict = {}
# For each word that we want in the dictionary, add it, then make it
# the value of the prior dictionary length
for word, word_count in count:
word_dict[word] = len(word_dict)
return(word_dict)
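A usage sketch with made-up sentences (assumes import collections, which the function relies on):
sentences = ["the cat sat on the mat", "the dog sat", "a cat ran"]
word_dict = build_dictionary(sentences, vocabulary_size=4)
print(word_dict)  # maps 'RARE' plus the 3 most frequent words to integer ids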
# Turn text data into lists of integers from dictionary
def load_text_vec(alphabet, filename="", embedding_size=100):
vectors = {}
with open(filename) as f:
i=0
for line in f:
i+=1
if i % 100000 == 0:
print 'epoch %d' % i
items = line.strip().split(' ')
if len(items) == 2:
vocab_size, embedding_size= items[0],items[1]
print ( vocab_size, embedding_size)
else:
word = items[0]
if word in alphabet:
vectors[word] = items[1:]
print 'embedding_size',embedding_size
print 'done'
print 'words found in word2vec embedding ', len(vectors.keys())
return vectors
def add_list_of_words_in_w2v_model(self, unknown_words):
huge_w2v_model_file = open(self.w2v_huge_model_path, "r")
current_w2v_model_file = open(self.w2v_model_path, "a")
line = huge_w2v_model_file.readline()
unknown_words_left = len(unknown_words)
while line and unknown_words_left:
word = line.split()[0]
if word in unknown_words:
current_w2v_model_file.write(line)
unknown_words = unknown_words - set([word])
unknown_words_left -= 1
line = huge_w2v_model_file.readline()
for word in list(unknown_words):
random_position = random(self.w2v_model.vector_size)*2-1
current_w2v_model_file.write(" ".join(([word]+[str(x) for x in random_position])))
print "warning random positions introduced for new words ... in the future this should be solved"
current_w2v_model_file.close()
huge_w2v_model_file.close()
def extract_NPs(chunk):
"""
Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
we extract the NPs with stopping and location words filtered out, and return a list of noun phrases.
"""
forbid_wds = stop_words + location_words
NPs = []
for phrase, ptype in chunk:
if ptype == 'NP':
filtered_wds = []
for wd in phrase.split():
if wd not in forbid_wds:
filtered_wds += [wd]
if len(' '.join(filtered_wds)) > 0:
NPs += [' '.join(filtered_wds)]
return NPs
def extract_NNs(chunk, pos):
"""
Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
and pos [(word, pos)], e.g., [('man', 'NN')],
we extract from the NPs with stopping, location, color and size words filtered out,
and return list of NN words only.
"""
forbid_wds = stop_words + location_words + color_words + size_words
NNs = []
for phrase, ptype in chunk:
if ptype == 'NP':
filtered_wds = []
for wd in phrase.split():
wd_pos = [p[1] for p in pos if p[0] == wd][0]
if wd not in forbid_wds and wd_pos != 'JJ' and wd_pos != 'CD': # we don't need JJ or CD words either
filtered_wds += [wd]
if len(' '.join(filtered_wds)) > 0:
NNs += [' '.join(filtered_wds)]
return NNs
def process_text(self, text):
flags = (UNICODE if sys.version < '3' and type(text) is unicode
else 0)
regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
words = findall(regexp, text, flags)
# remove stopwords
words = [word for word in words]
# remove 's
words = [word[:-2] if word.lower().endswith("'s") else word
for word in words]
# remove numbers
words = [word for word in words if not word.isdigit()]
if self.collocations:
word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
else:
word_counts, _ = process_tokens(words, self.normalize_plurals)
return word_counts
def tokenize(text):
"""
Tokenizes sequences of text and stems the tokens.
:param text: String to tokenize
:return: List with stemmed tokens
"""
tokens = nl.WhitespaceTokenizer().tokenize(text)
tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
tokens = [word for word in tokens if word not in stopwords.words('english')]
tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
stems = []
stemmer = SnowballStemmer("english")
for token in tokens:
token = stemmer.stem(token)
if token != "":
stems.append(token)
return stems
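A usage sketch, assuming the module imports re, nltk.tokenize as nl, SnowballStemmer from nltk.stem.snowball, and has the stopwords corpus available:
print(tokenize("The running dogs were barking loudly"))
# a list of stemmed tokens such as 'run', 'dog', 'bark' (order varies because sets are used internally)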
def review_to_wordlist( review, remove_stopwords=False ):
# Function to convert a document to a sequence of words,
# optionally removing stop words. Returns a list of words.
#
# 1. Remove HTML
review_text = BeautifulSoup(review).get_text()
#
# 2. Remove non-letters
review_text = re.sub("[^a-zA-Z]"," ", review_text)
#
# 3. Convert words to lower case and split them
words = review_text.lower().split()
#
# 4. Optionally remove stop words (false by default)
if remove_stopwords:
stops = set(stopwords.words("english"))
words = [w for w in words if not w in stops]
#
# 5. Return a list of words
return(words)
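A usage sketch (requires beautifulsoup4, re and the NLTK stopwords corpus, which the function assumes are already imported):
review = "<p>This movie was <b>great</b>, I loved it!</p>"
print(review_to_wordlist(review))                         # keeps stopwords
print(review_to_wordlist(review, remove_stopwords=True))  # e.g. ['movie', 'great', 'loved']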
# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
# Function to split a review into parsed sentences. Returns a
# list of sentences, where each sentence is a list of words
#
# 1. Use the NLTK tokenizer to split the paragraph into sentences
raw_sentences = tokenizer.tokenize(review.strip())
#
# 2. Loop over each sentence
sentences = []
for raw_sentence in raw_sentences:
# If a sentence is empty, skip it
if len(raw_sentence) > 0:
# Otherwise, call review_to_wordlist to get a list of words
sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
remove_stopwords ))
#
# Return the list of sentences (each sentence is a list of words,
# so this returns a list of lists)
return sentences
def extract_unigram_feats(document, unigrams, handle_negation=False):
"""
Populate a dictionary of unigram features, reflecting the presence/absence in
the document of each of the tokens in `unigrams`.
:param document: a list of words/tokens.
:param unigrams: a list of words/tokens whose presence/absence has to be
checked in `document`.
:param handle_negation: if `handle_negation == True` apply `mark_negation`
method to `document` before checking for unigram presence/absence.
:return: a dictionary of unigram features {unigram : boolean}.
>>> words = ['ice', 'police', 'riot']
>>> document = 'ice is melting due to global warming'.split()
>>> sorted(extract_unigram_feats(document, words).items())
[('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
"""
features = {}
if handle_negation:
document = mark_negation(document)
for word in unigrams:
features['contains({0})'.format(word)] = word in set(document)
return features
def __init__(self,
w=20,
k=10,
similarity_method=BLOCK_COMPARISON,
stopwords=None,
smoothing_method=DEFAULT_SMOOTHING,
smoothing_width=2,
smoothing_rounds=1,
cutoff_policy=HC,
demo_mode=False):
if stopwords is None:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
self.__dict__.update(locals())
del self.__dict__['self']
def from_words(cls, words, window_size=2):
"""Construct a BigramcollocationFinder for all bigrams in the given
sequence. When window_size > 2,count non-contiguous bigrams,in the
style of Church and Hanks's (1990) association ratio.
"""
wfd = Freqdist()
bfd = Freqdist()
if window_size < 2:
raise ValueError("Specify window_size at least 2")
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
wfd[w1] += 1
for w2 in window[1:]:
if w2 is not None:
bfd[(w1, w2)] += 1
return cls(wfd, bfd, window_size=window_size)
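The classmethod above matches BigramCollocationFinder.from_words in nltk.collocations; a small sketch of the public API (stopwords corpus assumed downloaded):
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords

tokens = ("merger talks collapsed after merger talks between the two "
          "oil companies stalled over oil prices").split()
finder = BigramCollocationFinder.from_words(tokens, window_size=2)
finder.apply_word_filter(lambda w: w.lower() in stopwords.words('english'))
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 3))  # e.g. [('merger', 'talks'), ...]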
def from_words(cls, words, window_size=3):
"""Construct a TrigramCollocationFinder for all trigrams in the given
sequence.
"""
if window_size < 3:
raise ValueError("Specify window_size at least 3")
wfd = FreqDist()
wildfd = FreqDist()
bfd = FreqDist()
tfd = FreqDist()
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
for w2, w3 in _itertools.combinations(window[1:], 2):
wfd[w1] += 1
if w2 is None:
continue
bfd[(w1, w2)] += 1
if w3 is None:
continue
wildfd[(w1, w3)] += 1
tfd[(w1, w2, w3)] += 1
return cls(wfd, wildfd, tfd)