Python nltk.corpus.stopwords module: words() example source code
The following 50 code examples, extracted from open-source Python projects, illustrate how to use nltk.corpus.stopwords.words().
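Before the project examples, a minimal self-contained sketch of the call itself (it assumes the stopwords corpus data has already been fetched with nltk.download):
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')  # one-time download of the corpus data
english_stops = set(stopwords.words('english'))
print(len(english_stops))            # size of the English stopword list
print('the' in english_stops)        # True
print(stopwords.words('dutch')[:5])  # other languages are available as well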
def SpeechToText():
r = sr.Recognizer() #Speech recognition
with sr.Microphone() as source:
print("Say something!")
audio = r.listen(source)
message = r.recognize_google(audio)
print("Check: "+message)
try:
print("User: " + r.recognize_google(audio))
except sr.UnknownValueError:
print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
print("Could not request results from Google Speech Recognition service; {0}".format(e))
return message
# Function to find the importance of words, so they can be used to deduce which thing is being asked about the most
def preprocessing(text):
text = text.decode("utf8")
# tokenize into words
tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
# remove stopwords
stop = stopwords.words('english')
tokens = [token for token in tokens if token not in stop]
# remove words less than three letters
tokens = [word for word in tokens if len(word) >= 3]
# lower capitalization
tokens = [word.lower() for word in tokens]
# lemmatize
lmtzr = WordNetLemmatizer()
tokens = [lmtzr.lemmatize(word) for word in tokens]
preprocessed_text= ' '.join(tokens)
return preprocessed_text
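A hedged usage sketch for preprocessing(): it assumes the punkt, stopwords and wordnet NLTK data packages are installed and that the module already imports nltk, stopwords and WordNetLemmatizer as the function expects. Because the function calls text.decode("utf8"), the caller passes a byte string:
raw = b"The cats were sitting quietly on the mats."
print(preprocessing(raw))
# prints a lemmatized, stopword-filtered string such as "the cat sitting quietly mat"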
def collection_stats():
# list of documents
documents_stat = reuters.fileids()
print(str(len(documents_stat)) + " documents")
train_docs_stat = list(filter(lambda doc: doc.startswith("train"), documents_stat))
print(str(len(train_docs_stat)) + " total training documents")
test_docs_stat = list(filter(lambda doc: doc.startswith("test"), documents_stat))
print(str(len(test_docs_stat)) + " total test documents")
# list of categories
categories = reuters.categories()
print(str(len(categories)) + " categories")
# get the documents in a category
category_docs = reuters.fileids("acq")
# words for a document
document_id = category_docs[0]
document_words = reuters.words(category_docs[0])
print(document_words)
# print the raw document
print(reuters.raw(document_id))
def collocations(self, num=20, window_size=2):
"""
Print collocations derived from the text, ignoring stopwords.
:seealso: find_collocations
:param num: The maximum number of collocations to print.
:type num: int
:param window_size: The number of tokens spanned by a collocation (default=2)
:type window_size: int
"""
if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
self._num = num
self._window_size = window_size
#print("Building collocations list")
from nltk.corpus import stopwords
ignored_words = stopwords.words('english')
finder = BigramCollocationFinder.from_words(self.tokens, window_size)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
bigram_measures = BigramAssocMeasures()
self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
print(tokenwrap(colloc_strings, separator="; "))
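The method above is essentially nltk.text.Text.collocations(); a small sketch of calling it through the public class (assumes the genesis corpus and the stopwords data are downloaded):
import nltk
from nltk.corpus import genesis

text = nltk.Text(genesis.words('english-kjv.txt'))
text.collocations(num=10, window_size=2)  # prints the ten strongest collocations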
def get_user_to_word_proportion(user_to_text, word):
"""
Maps each user to the proportion of his words that consist of a specified
word.
"""
user_to_word_proportion = {}
for user in user_to_text:
lm = LanguageModel(user_to_text[user])
n_tokens = len(lm.lowercase_tokens)
if n_tokens > 0:
fd = nltk.FreqDist(lm.lowercase_tokens)
user_to_word_proportion[user] = fd[word] / float(n_tokens)
else:
user_to_word_proportion[user] = 0.0
print 'Finished user {}'.format(user.encode('utf-8'))
return user_to_word_proportion
def generate(cfd, start_word, n):
word = start_word
words = []
for i in range(n):
words.append(word)
# word = cfd[word].max()
fd = cfd[word]
n_next_words = sum(fd.values())
if n_next_words > 0:
probabilities = [fd[w]/float(n_next_words) for w in sorted(fd.keys())]
word = choice(sorted(fd.keys()), p=probabilities)
else:
# Pick random word
old_word = word
# Todo: use unigram probabilities later
word = choice(cfd.keys())
words.append(word)
sentence = ' '.join(words)
# Todo: modify above for punctuation
return sentence
def rm_stop_words(data, mode="nltk", silent=1):
"""
Input:
data is a set, {} or Counter
"""
if silent==0:
print("remove stop words ...")
if mode == "nltk":
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
else:
print("unkNown mode",mode)
assert 0
if isinstance(data,list):
data = [i for i in data if i.lower() not in stop_words]
return data
else:
for word in stop_words:
if word in data:
del data[word]
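A short usage sketch for rm_stop_words(): with a list it returns a filtered copy, while with a Counter (or dict) it deletes stopword keys in place and returns None:
from collections import Counter

tokens = ["This", "is", "a", "simple", "example"]
print(rm_stop_words(tokens, mode="nltk", silent=0))  # e.g. ['simple', 'example']

counts = Counter({"the": 10, "model": 3, "and": 7})
rm_stop_words(counts, mode="nltk")  # stopword keys removed in place
print(counts)                       # Counter({'model': 3})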
def words_to_char_sequence(words_list, tk):
"""Convert words list to chars sequence
# Arguments
words_list: word list, shape (sentence_len, word_len)
# Output shape
(sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
"""
c_seqs = np.zeros((len(words_list),
TrainConfig.MAX_SEQUENCE_LENGTH,
TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
for w_i in xrange(len(words_list)):
words = words_list[w_i]
fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
ws = tk.texts_to_sequences(words)
ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
if TrainConfig.MAX_SEQUENCE_LENGTH < len(words):
max_word_len = TrainConfig.MAX_SEQUENCE_LENGTH
else:
max_word_len = len(words)
fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
c_seqs[w_i] = fixed_ws
return c_seqs
def tiny_tokenize(text, stem=False, stop_words=[]):
words = []
for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', \
text.decode(encoding='UTF-8', errors='ignore'))):
if not token.isdigit() and not token in stop_words:
if stem:
try:
w = EnglishStemmer().stem(token)
except Exception as e:
w = token
else:
w = token
words.append(w)
return words
# return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
#     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
#     not token.isdigit() and not token in stop_words]
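A usage sketch, assuming the module imports re, string, wordpunct_tokenize from nltk.tokenize, EnglishStemmer from nltk.stem.snowball, and the NLTK stopword list; the function expects a byte string because it calls text.decode():
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))
print(tiny_tokenize(b"Running 3 quick tests, quickly!", stem=True, stop_words=stops))
# e.g. ['run', 'quick', 'test', 'quick'] -- digits and stopwords are dropped, the rest stemmed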
def build_vocab(word_freq, threshold=5, topn=None, start_idx=0):
"""
threshold only takes effect when topn is None.
words are indexed by overall frequency in the dataset.
"""
word_freq = sorted(word_freq.iteritems(), key=lambda d:d[1], reverse=True)
if topn:
word_freq = zip(*word_freq[:topn])[0]
vocab_dict = dict(zip(word_freq, range(start_idx, len(word_freq) + start_idx)))
else:
idx = start_idx
vocab_dict = {}
for word, freq in word_freq:
if freq < threshold:
return vocab_dict
vocab_dict[word] = idx
idx += 1
return vocab_dict
def bigrams(words, join_string, skip=0):
"""
Input: a list of words, e.g., ["I", "am", "Denny"]
Output: a list of bigrams, e.g., ["I_am", "am_Denny"]
"""
assert type(words) == list
L = len(words)
if L > 1:
lst = []
for i in range(L - 1):
for k in range(1, skip + 2):
if i + k < L:
lst.append(join_string.join([words[i], words[i + k]]))
else:
# set it as unigram
lst = NgramUtil.unigrams(words)
return lst
def trigrams(words, join_string, skip=0):
"""
Input: a list of words, e.g., ["I", "am", "Denny"]
Output: a list of trigrams, e.g., ["I_am_Denny"]
"""
assert type(words) == list
L = len(words)
if L > 2:
lst = []
for i in range(L - 2):
for k1 in range(1, skip + 2):
for k2 in range(1, skip + 2):
if i + k1 < L and i + k1 + k2 < L:
lst.append(join_string.join([words[i], words[i + k1], words[i + k1 + k2]]))
else:
# set it as bigram
lst = NgramUtil.bigrams(words, skip)
return lst
def biterms(words, join_string):
"""
Input: a list of words,"Denny","boy"]
Output: a list of biterm,"I_Denny","I_boy","am_Denny","am_boy","Denny_boy"]
"""
assert type(words) == list
L = len(words)
if L > 1:
lst = []
for i in range(L - 1):
for j in range(i + 1, L):
lst.append(join_string.join([words[i], words[j]]))
else:
# set it as uniterm
lst = NgramUtil.uniterms(words)
return lst
def triterms(words, join_string):
"""
Input: a list of words, e.g., ["I", "am", "Denny", "boy"]
Output: a list of triterms, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"]
"""
assert type(words) == list
L = len(words)
if L > 2:
lst = []
for i in xrange(L - 2):
for j in xrange(i + 1, L - 1):
for k in xrange(j + 1, L):
lst.append(join_string.join([words[i], words[j], words[k]]))
else:
# set it as biterm
lst = NgramUtil.biterms(words, join_string)
return lst
def fourterms(words, join_string):
"""
Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"]
Output: a list of fourterms, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"]
"""
assert type(words) == list
L = len(words)
if L > 3:
lst = []
for i in xrange(L - 3):
for j in xrange(i + 1, L - 2):
for k in xrange(j + 1, L - 1):
for l in xrange(k + 1, L):
lst.append(join_string.join([words[i], words[j], words[k], words[l]]))
else:
# set it as triterm
lst = NgramUtil.triterms(words, join_string)
return lst
def ngrams(words, ngram, join_string=" "):
"""
wrapper for ngram
"""
if ngram == 1:
return NgramUtil.unigrams(words)
elif ngram == 2:
return NgramUtil.bigrams(words, join_string)
elif ngram == 3:
return NgramUtil.trigrams(words, join_string)
elif ngram == 4:
return NgramUtil.fourgrams(words, join_string)
elif ngram == 12:
unigram = NgramUtil.unigrams(words)
bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
return unigram + bigram
elif ngram == 123:
unigram = NgramUtil.unigrams(words)
bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3]
return unigram + bigram + trigram
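The helpers above call one another through an NgramUtil namespace, so presumably they are staticmethods of an NgramUtil class; under that assumption (and assuming unigrams() simply returns the word list), a usage sketch:
words = ["I", "am", "Denny"]
print(NgramUtil.unigrams(words))          # ['I', 'am', 'Denny'] (assumed behaviour)
print(NgramUtil.bigrams(words, "_"))      # ['I_am', 'am_Denny']
print(NgramUtil.trigrams(words, "_"))     # ['I_am_Denny']
print(NgramUtil.ngrams(words, 123, "_"))  # unigrams + bigrams + trigrams combined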
def build_vocabulary( words, max_size ):
vocab_instances = 0
unique_counts = Counter(words)
d = dict(unique_counts.most_common(cfg.vocabulary_size-2) )
vocabulary = OrderedDict( sorted(d.items(), key=lambda t: t[1], reverse=True) )
# start at 2 to leave room for padding & unknown
pb = Progress_bar(len(d) - 1)
for i, (key, value) in enumerate(vocabulary.items(), start=2):
vocab_instances += value
vocabulary[key] = i
pb.tick()
vocabulary[cfg.padding_char] = 0
vocabulary[cfg.placeholder_char] = 1
# reverse the vocabulary (for reverse lookup)
rev_vocabulary = {v: k for k, v in vocabulary.items()}
vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)
return vocab
def tokenize_text( sample_text ):
global sequence_lengths
processed_text = []
if cfg.remove_punctuation:
cleaned = sample_text.lower().translate( t_table )
else:
cleaned = sample_text
if cfg.use_casual_tokenizer:
tokens = tknzr.tokenize( cleaned )
else:
tokens = nltk.word_tokenize( cleaned, language='english')
if cfg.remove_stopwords:
tokens = [w for w in tokens if not w in stopwords.words('english')]
sequence_lengths.append( len( tokens ) )
processed_text.extend( tokens )
return processed_text
def __init__(self, min_cut=0.1, max_cut=0.9):
# indentation changes - we are inside the constructor
# here we set up the behaviour
# this is called each time an object of the frequency summarizer class is
# created or instantiated
self._min_cut = min_cut # 'self' refers to the current instance
self._max_cut = max_cut
# we save the val of the 2 parameters passed by assigning them
# two member variables - the 'self.' prefix identifies them as part
# of the self argument - using underscore as first char.
self._stopwords = set(stopwords.words('english') + list(punctuation))
# this is a list of all common words and punctuation symbols
# indentation changes - we are out of the constructor here
# This is still the body of the class
# Defining var here ( outside a member function) but within the class
# member var becomes STATIC. This means it belongs to the class, and not
# to any specific individual instance (object) of the class
def extractFeatures(self, article, n, customStopWords=None):
# pass in article as a tuple ( text,title)
text = article[0]
# extract the text
title = article[1]
# extract the title
sentences = sent_tokenize(text)
# split text into sentences
word_sent = [word_tokenize(s.lower()) for s in sentences]
# split sentences into words
self._freq = self._compute_frequencies(word_sent, customStopWords)
# calculate word freq using member func created above
if n < 0:
# how many features (words) to return - a -ve number means
# no feature (word) selection, just return all features
return nlargest(len(self._freq.keys()),
self._freq, key=self._freq.get)
else:
# here we say if the calling func has asked for a subset
# then return only the 'n' largest features, i.e. the
# most important words (important == frequent, excluding stopwords)
return nlargest(n, self._freq, key=self._freq.get)
def similarity(c1, c2):
'''Stop words are words like "it" and "the", which have little impact on the
meaning of the sentence.'''
stop_words = list(stopwords.words("english"))
# Removes stop words in both sentences
c1_cleaned = [x for x in word_tokenize(c1) if x not in stop_words]
c2_cleaned = [x for x in word_tokenize(c2) if x not in stop_words]
c1_words = Counter(dedupe(c1_cleaned))
c2_words = Counter(dedupe(c2_cleaned))
total_words = c1_words + c2_words
similarity_between_words = 0
for key, val in total_words.items():
''' Looks at whether the two articles share a word'''
if total_words[key] > 1:
similarity_between_words += 1
return similarity_between_words / (log(len(c1_words)) + log(len(c2_words)))
def _answer_stop_word_density(self, row):
"""Percentage of tokens in the answer are stopwords
- Args:
row(pandas.dataframe): input row vector
- Returns:
row(pandas.dataframe): output vector with new feature
"""
stop = stopwords.words('english')
answer = row.Answer
if answer:
tokens = answer.split()
num_tokens = len(tokens)
stop_word_in_answer = [i for i in tokens if i in stop]
num_stop_word_in_answer = len(stop_word_in_answer)
row['ANSWER_STOPWORD_DENSITY'] = float(
num_stop_word_in_answer) / num_tokens
return row
else:
row['ANSWER_STOPWORD_DENSITY'] = 0
return row
def _answer_quantifier_density(self, row):
"""Percentage of tokens in the answer that are quantifier words
- Args:
row(pandas.dataframe): input pandas dataframe
- Returns:
row(pandas.dataframe): result a pandas dataframe with new feature
"""
answer = row.Answer
if answer:
tokens = answer.split()
answer_len = len(tokens)
quantifier_tokens = [
i for i in tokens if i in ling.QUANTIFIER_WORDS]
quantifier_tokens_len = len(quantifier_tokens)
row['ANSWER_QUANTIFIER_DENSITY'] = float(
quantifier_tokens_len) / answer_len
return row
else:
row['ANSWER_QUANTIFIER_DENSITY'] = 0
return row
def _percentage_capitalized_word_in_answer(self, row):
"""Percentage of capitalized words in the sentence that are in the answer
- Args:
row(pandas.dataframe): input pandas dataframe
- Returns:
row(pandas.dataframe): result a pandas dataframe with new feature
"""
answer = row.Answer
sentence = row.Sentence
if answer is not None and sentence is not None:
tokens = sentence.split()
num_tokens = len(tokens)
cap_tokens = [i for i in tokens if i.isupper() == True]
cap_tokens_in_answer = [i for i in cap_tokens if i in answer]
row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = float(
len(cap_tokens_in_answer)) / num_tokens
return row
else:
row['PERCENT_CAPITALIZED_WORDS_IN_ANSWER'] = 0
return row
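These extractors are meant to be applied row-wise to a pandas DataFrame that carries 'Answer' and 'Sentence' columns; a hypothetical sketch, where extractor stands for an instance of the (not shown) feature class. The quantifier feature is skipped here because it depends on the project-specific ling.QUANTIFIER_WORDS list:
import pandas as pd

df = pd.DataFrame({
    "Answer": ["the Eiffel Tower in Paris"],
    "Sentence": ["He visited the Eiffel Tower in Paris last May."],
})
df = df.apply(extractor._answer_stop_word_density, axis=1)
df = df.apply(extractor._percentage_capitalized_word_in_answer, axis=1)
print(df[["ANSWER_STOPWORD_DENSITY", "PERCENT_CAPITALIZED_WORDS_IN_ANSWER"]])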
def get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt):
"""
Get overlap, idf-weighted overlap, overlap excluding stopwords, and idf-weighted overlap excluding stopwords.
"""
stoplist = set(stopwords.words('english'))
num_docs = len(sent_list_1)
overlap_feats = []
for s1, s2 in zip(sent_list_1, sent_list_2):
tokens_a_set, tokens_b_set = set(s1), set(s2)
intersect = tokens_a_set & tokens_b_set
overlap = len(intersect) / (len(tokens_a_set) + len(tokens_b_set))
idf_intersect = sum(np.math.log(num_docs / word_to_doc_cnt[w]) for w in intersect)
idf_weighted_overlap = idf_intersect / (len(tokens_a_set) + len(tokens_b_set))
tokens_a_set_no_stop = set(w for w in s1 if w not in stoplist)
tokens_b_set_no_stop = set(w for w in s2 if w not in stoplist)
intersect_no_stop = tokens_a_set_no_stop & tokens_b_set_no_stop
overlap_no_stop = len(intersect_no_stop) / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
idf_intersect_no_stop = sum(np.math.log(num_docs / word_to_doc_cnt[w]) for w in intersect_no_stop)
idf_weighted_overlap_no_stop = idf_intersect_no_stop / (len(tokens_a_set_no_stop) + len(tokens_b_set_no_stop))
overlap_feats.append([overlap, idf_weighted_overlap, overlap_no_stop, idf_weighted_overlap_no_stop])
return overlap_feats
def get_similar_documents_for_query(model_id, text):
"""
Return documents similar to the query, or an empty list if an error occurs or the query has no words after preprocessing
:param model_id:
:param text:
:return:
"""
model = db_utils.get_model(model_id)
topics_assignment = assign_topics_for_query(model_id, text)
if len(topics_assignment) != 0:
topics_vector = transform_topics_assignment_from_lda_to_vector(model['number_of_topics'], topics_assignment[0])
# print(topics_vector)
return get_similar_documents_by_vector(model_id, topics_vector)
else:
return []
def get_binary(self):
return Pipeline([
('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
('feat_select', SelectPercentile(percentile=10)),
('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False
)))
])
def get_sgdc(self):
return Pipeline([
('tfidf', SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False))
])
def wash(fileList):
# denyPos = ['CC','CD','DT','TO','']
st = LancasterStemmer()
for f in tqdm(fileList):
fr = open('./washFile/' + f, 'r')
fw = open("./washFile_stem/" + f, 'w')
for line in fr.read().splitlines():
line = remove_punctuation(line).lower()
# wordpos = pos(remove_punctuation(line).lower())
# for turple in wordpos:
# if (turple[0] not in stopwords.words('english')):
# fw.write(turple[0] + ' ')
# fw.write(x + ' ' for x in line.split() if x not in stopwords.words('english'))
# stopw = stopwords.words('english')
words = [x for x in line.split()]
for x in words:
try:
fw.write(st.stem(x) + ' ')
except:
print x
fr.close()
fw.close()
def count_entries(file_list):
"""Performs a count of the number of number of words in the corpus
Args:
file_list (list): list of file names.
Returns:
list: A list of json objects containing the count per file name
"""
result = []
for obj in file_list:
with open(CSV_PATH + obj + '.csv', "r") as entry:
reader = csv.reader(entry, delimiter=",")
col_count = len(reader.next())
res = {"Filename": obj, "Count": col_count}
result.append(res)
return result
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for label, words in labelled_words:
for word in words:
word_fd[word] += 1
label_word_fd[label][word] += 1
n_xx = label_word_fd.N()
high_info_words = set()
for label in label_word_fd.conditions():
n_xi = label_word_fd[label].N()
word_scores = collections.defaultdict(int)
for word, n_ii in label_word_fd[label].items():
n_ix = word_fd[word]
score = score_fn(n_ii, (n_ix, n_xi), n_xx)
word_scores[word] = score
bestwords = [word for word, score in word_scores.items() if score >= min_score]
high_info_words |= set(bestwords)
return high_info_words
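A usage sketch with the NLTK movie_reviews corpus (assumed downloaded), and assuming the module-level imports the function needs: collections, FreqDist and ConditionalFreqDist from nltk.probability, and BigramAssocMeasures from nltk.metrics:
from nltk.corpus import movie_reviews

labelled_words = [(label, movie_reviews.words(categories=[label]))
                  for label in movie_reviews.categories()]
high_info = high_information_words(labelled_words, min_score=5)
print(len(high_info))          # number of words that discriminate well between labels
print(sorted(high_info)[:10])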
def build_dictionary(sentences, vocabulary_size):
# Turn sentences (list of strings) into lists of words
split_sentences = [s.split() for s in sentences]
words = [x for sublist in split_sentences for x in sublist]
# Initialize list of [word, word_count] for each word, starting with unknown
count = [['RARE', -1]]
# Now add most frequent words, limited to the N-most frequent (N = vocabulary size)
count.extend(collections.Counter(words).most_common(vocabulary_size-1))
# Now create the dictionary
word_dict = {}
# For each word that we want in the dictionary, add it, then make it
# the value of the prior dictionary length
for word, word_count in count:
word_dict[word] = len(word_dict)
return(word_dict)
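A usage sketch with made-up sentences (assumes import collections, which the function relies on):
sentences = ["the cat sat on the mat", "the dog sat", "a cat ran"]
word_dict = build_dictionary(sentences, vocabulary_size=4)
print(word_dict)  # maps 'RARE' plus the 3 most frequent words to integer ids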
# Turn text data into lists of integers from dictionary
def load_text_vec(alphabet, filename="", embedding_size=100):
vectors = {}
with open(filename) as f:
i=0
for line in f:
i+=1
if i % 100000 == 0:
print 'epoch %d' % i
items = line.strip().split(' ')
if len(items) == 2:
vocab_size, embedding_size= items[0],items[1]
print ( vocab_size, embedding_size)
else:
word = items[0]
if word in alphabet:
vectors[word] = items[1:]
print 'embedding_size',embedding_size
print 'done'
print 'words found in word2vec embedding ', len(vectors.keys())
return vectors
def add_list_of_words_in_w2v_model(self, unknown_words):
huge_w2v_model_file = open(self.w2v_huge_model_path, "r")
current_w2v_model_file = open(self.w2v_model_path, "a")
line = huge_w2v_model_file.readline()
unknown_words_left = len(unknown_words)
while line and unknown_words_left:
word = line.split()[0]
if word in unknown_words:
current_w2v_model_file.write(line)
unknown_words = unknown_words - set([word])
unknown_words_left -= 1
line = huge_w2v_model_file.readline()
for word in list(unknown_words):
random_position = random(self.w2v_model.vector_size)*2-1
current_w2v_model_file.write(" ".join(([word]+[str(x) for x in random_position])))
print "warning random positions introduced for new words ... in the future this should be solved"
current_w2v_model_file.close()
huge_w2v_model_file.close()
def extract_NPs(chunk):
"""
Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
we extract the NPs with stopping and location words filtered out, and return a list of noun phrases.
"""
forbid_wds = stop_words + location_words
NPs = []
for phrase, ptype in chunk:
if ptype == 'NP':
filtered_wds = []
for wd in phrase.split():
if wd not in forbid_wds:
filtered_wds += [wd]
if len(' '.join(filtered_wds)) > 0:
NPs += [' '.join(filtered_wds)]
return NPs
def extract_NNs(chunk, pos):
"""
Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
and pos [(word, pos)], e.g., [('man', 'NN')],
we extract from the NPs with stopping, location, color and size words filtered out,
and return list of NN words only.
"""
forbid_wds = stop_words + location_words + color_words + size_words
NNs = []
for phrase, ptype in chunk:
if ptype == 'NP':
filtered_wds = []
for wd in phrase.split():
wd_pos = [p[1] for p in pos if p[0] == wd][0]
if wd not in forbid_wds and wd_pos != 'JJ' and wd_pos != 'CD': # we don't need JJ or CD words either
filtered_wds += [wd]
if len(' '.join(filtered_wds)) > 0:
NNs += [' '.join(filtered_wds)]
return NNs
def process_text(self, text):
flags = (UNICODE if sys.version < '3' and type(text) is unicode
else 0)
regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
words = findall(regexp, text, flags)
# remove stopwords
words = [word for word in words]
# remove 's
words = [word[:-2] if word.lower().endswith("'s") else word
for word in words]
# remove numbers
words = [word for word in words if not word.isdigit()]
if self.collocations:
word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
else:
word_counts, _ = process_tokens(words, self.normalize_plurals)
return word_counts
def tokenize(text):
"""
Tokenizes sequences of text and stems the tokens.
:param text: String to tokenize
:return: List with stemmed tokens
"""
tokens = nl.WhitespaceTokenizer().tokenize(text)
tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
tokens = [word for word in tokens if word not in stopwords.words('english')]
tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
stems = []
stemmer = SnowballStemmer("english")
for token in tokens:
token = stemmer.stem(token)
if token != "":
stems.append(token)
return stems
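A usage sketch, assuming the module imports re, nltk.tokenize as nl, SnowballStemmer from nltk.stem.snowball, and has the stopwords corpus available:
print(tokenize("The running dogs were barking loudly"))
# a list of stemmed tokens such as 'run', 'dog', 'bark' (order varies because sets are used internally)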
def review_to_wordlist( review, remove_stopwords=False ):
# Function to convert a document to a sequence of words,
# optionally removing stop words. Returns a list of words.
#
# 1. Remove HTML
review_text = BeautifulSoup(review).get_text()
#
# 2. Remove non-letters
review_text = re.sub("[^a-zA-Z]"," ", review_text)
#
# 3. Convert words to lower case and split them
words = review_text.lower().split()
#
# 4. Optionally remove stop words (false by default)
if remove_stopwords:
stops = set(stopwords.words("english"))
words = [w for w in words if not w in stops]
#
# 5. Return a list of words
return(words)
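A usage sketch (requires beautifulsoup4, re and the NLTK stopwords corpus, which the function assumes are already imported):
review = "<p>This movie was <b>great</b>, I loved it!</p>"
print(review_to_wordlist(review))                         # keeps stopwords
print(review_to_wordlist(review, remove_stopwords=True))  # e.g. ['movie', 'great', 'loved']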
# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
# Function to split a review into parsed sentences. Returns a
# list of sentences, where each sentence is a list of words
#
# 1. Use the NLTK tokenizer to split the paragraph into sentences
raw_sentences = tokenizer.tokenize(review.strip())
#
# 2. Loop over each sentence
sentences = []
for raw_sentence in raw_sentences:
# If a sentence is empty, skip it
if len(raw_sentence) > 0:
# Otherwise, call review_to_wordlist to get a list of words
sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
remove_stopwords ))
#
# Return the list of sentences (each sentence is a list of words,
# so this returns a list of lists)
return sentences
def extract_unigram_feats(document, unigrams, handle_negation=False):
"""
Populate a dictionary of unigram features, reflecting the presence/absence in
the document of each of the tokens in `unigrams`.
:param document: a list of words/tokens.
:param unigrams: a list of words/tokens whose presence/absence has to be
checked in `document`.
:param handle_negation: if `handle_negation == True` apply `mark_negation`
method to `document` before checking for unigram presence/absence.
:return: a dictionary of unigram features {unigram : boolean}.
>>> words = ['ice', 'police', 'riot']
>>> document = 'ice is melting due to global warming'.split()
>>> sorted(extract_unigram_feats(document, words).items())
[('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
"""
features = {}
if handle_negation:
document = mark_negation(document)
for word in unigrams:
features['contains({0})'.format(word)] = word in set(document)
return features
def __init__(self,
w=20,
k=10,
similarity_method=BLOCK_COMPARISON,
stopwords=None,
smoothing_method=DEFAULT_SMOOTHING,
smoothing_width=2,
smoothing_rounds=1,
cutoff_policy=HC,
demo_mode=False):
if stopwords is None:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
self.__dict__.update(locals())
del self.__dict__['self']
def from_words(cls, words, window_size=2):
"""Construct a BigramcollocationFinder for all bigrams in the given
sequence. When window_size > 2,count non-contiguous bigrams,in the
style of Church and Hanks's (1990) association ratio.
"""
wfd = Freqdist()
bfd = Freqdist()
if window_size < 2:
raise ValueError("Specify window_size at least 2")
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
wfd[w1] += 1
for w2 in window[1:]:
if w2 is not None:
bfd[(w1, w2)] += 1
return cls(wfd, bfd, window_size=window_size)
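The classmethod above matches BigramCollocationFinder.from_words in nltk.collocations; a small sketch of the public API (stopwords corpus assumed downloaded):
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords

tokens = ("merger talks collapsed after merger talks between the two "
          "oil companies stalled over oil prices").split()
finder = BigramCollocationFinder.from_words(tokens, window_size=2)
finder.apply_word_filter(lambda w: w.lower() in stopwords.words('english'))
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 3))  # e.g. [('merger', 'talks'), ...]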
def from_words(cls, words, window_size=3):
"""Construct a TrigramCollocationFinder for all trigrams in the given
sequence.
"""
if window_size < 3:
raise ValueError("Specify window_size at least 3")
wfd = FreqDist()
wildfd = FreqDist()
bfd = FreqDist()
tfd = FreqDist()
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
for w2, w3 in _itertools.combinations(window[1:], 2):
wfd[w1] += 1
if w2 is None:
continue
bfd[(w1, w2)] += 1
if w3 is None:
continue
wildfd[(w1, w3)] += 1
tfd[(w1, w2, w3)] += 1
return cls(wfd, wildfd, tfd)