Python basics - string manipulation

Import a text list from a JSON file

import json
# Load the text list from the JSON file. The context manager guarantees the
# file handle is closed even if parsing fails.
with open('file_name') as json_file:
    list_name = json.load(json_file)

Remove punctuation

import string
# Translation table that deletes every character in string.punctuation.
# Built once so that applying remove_punctuation to many rows does not
# rebuild it on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return *text* with every ``string.punctuation`` character removed.

    The two-argument ``text.translate(None, chars)`` form only exists for
    Python 2 byte strings; on Python 3 ``str.translate`` takes a single
    table argument, so a deletion table from ``str.maketrans`` is used.

    Args:
        text: The string to clean.

    Returns:
        A new string containing the characters of *text* that are not in
        ``string.punctuation``, in their original order.
    """
    return text.translate(_PUNCTUATION_TABLE)
# Replace missing values in the column with empty strings before cleaning
# (presumably so remove_punctuation is only ever handed strings — confirm
# against how df is built; df is defined outside this section).
df['col_name'] = df['col_name'].fillna('')
# Apply remove_punctuation to each entry and store the result in a new column.
df['col_name_clean'] = df['col_name'].apply(remove_punctuation)

Build a word count vector

from sklearn.feature_extraction.text import CountVectorizer
# Override the default token pattern (which drops single-character tokens)
# so that one-letter words are preserved.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Alternative vectorizer restricted to a fixed vocabulary taken from word_list.
vectorizer_vocabulaty = CountVectorizer(vocabulary = word_list)
# fit() only learns the vocabulary and returns the vectorizer itself;
# fit_transform() learns it and returns the document-term count matrix.
feature_matrix = vectorizer.fit_transform(text_list)

Sorted word count algorithm

def count_word(s, n):
    """Return the *n* most frequent whitespace-separated words in *s*.

    Words are ranked by descending count; words with equal counts are
    ordered alphabetically.

    Args:
        s: The text to analyse. It is split on whitespace with
            ``str.split()``; no case folding or punctuation stripping
            is performed.
        n: How many entries to keep. It is applied as the slice ``[:n]``
            to the ranked list.

    Returns:
        A list of ``[word, count]`` lists, most frequent first.
    """
    # Local import keeps this snippet self-contained.
    from collections import Counter

    # A single pass over the words instead of one list.count() scan per
    # unique word, which was quadratic in the worst case.
    word_counts = Counter(s.split())

    # Highest count first; alphabetical order breaks ties.
    ranked = sorted(word_counts.items(), key=lambda item: (-item[1], item[0]))

    return [[word, count] for word, count in ranked[:n]]