In [2]:
import string
import numpy as np
import pandas as pd
import re
import json

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD


# import jupyternotify
# ip = get_ipython()
# ip.register_magics(jupyternotify.JupyterNotifyMagics)
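word_tokenize, the stopword list, and the WordNetLemmatizer each depend on an NLTK corpus that is not bundled with the library itself. If they have never been fetched in this environment, a one-time download cell like the following is needed (the resource names are the standard NLTK package ids; the calls are no-ops when the data is already present):
In [ ]:
# one-time NLTK resource downloads (safe to re-run)
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # English stopword list
nltk.download('wordnet')    # lexical database behind WordNetLemmatizer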
In [ ]:
df = pd.read_csv("all.tsv", sep="\t")
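The per-year filter below compares `year` against `str(i)`, which only matches if pandas parsed that column as strings; if all.tsv stores years as plain integers, the comparison silently yields empty frames. A small, hedged guard (assuming the column is named `year`, as used below):
In [ ]:
# make the year comparison robust to whichever dtype pandas inferred
df['year'] = df['year'].astype(str)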
In [ ]:
df_dict = {}

# one sub-frame per year (copy, so the edits below don't trigger SettingWithCopyWarning)
for i in range(1985, 2018):
    df_dict[i] = df[df['year'] == str(i)].copy()
In [ ]:
for i in range(1985, 2018):
    # each column can contain several letters; split on the salutation and
    # expand to one row per letter (the original index repeats across pieces)
    df_dict[i]['text'] = df_dict[i]['text'].str.lower()
    s = df_dict[i]['text'].str.split("dear abby: ").explode()
    s.name = 'text'
    del df_dict[i]['text']
    df_dict[i] = df_dict[i].join(s)
    df_dict[i] = df_dict[i][df_dict[i]['text'] != '']
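As a sanity check on the split-and-explode step, here is what it does to a toy frame (hypothetical data, not from all.tsv): a row whose text holds two letters becomes two rows sharing the original index, plus an empty leading piece.
In [ ]:
toy = pd.DataFrame({'text': ['dear abby: first letter. dear abby: second letter.']})
s = toy['text'].str.lower().str.split('dear abby: ').explode()
print(s.tolist())  # ['', 'first letter. ', 'second letter.'] -- the '' piece
                   # is why the text != '' filter above is needed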
In [ ]:
# stack all years into one frame (DataFrame.append was removed in pandas 2.0)
df_targetuxpress = pd.concat([df_dict[i] for i in range(1985, 2018)])
In [ ]:
df_targetuxpress = df_targetuxpress.reset_index(drop=True)
df_targetuxpress['index'] = df_targetuxpress.index.astype(str)
# append the row number so duplicate titles become unique keys
df_targetuxpress['title_new'] = df_targetuxpress['title'] + df_targetuxpress['index']
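title_new exists to be a unique key: duplicate titles would silently collapse when the questions are loaded into a dict below, and again in the merge at the end. A cheap assertion makes that invariant explicit:
In [ ]:
# title_new must be unique: it becomes the dict key (and later the merge key)
assert df_targetuxpress['title_new'].is_unique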
In [ ]:
# split each letter into the question and Abby's answer (".str" tuple
# unpacking after split was removed from pandas; use expand=True instead)
split_qa = df_targetuxpress['text'].str.split('\ndear ', n=1, expand=True)
df_targetuxpress['question_only'] = split_qa[0]
df_targetuxpress['answer'] = split_qa[1]
# df_targetuxpress = df_targetuxpress[df_targetuxpress['question_only'].str.contains("dear abby: ")]
# df_targetuxpress = df_targetuxpress[(df_targetuxpress['question_only'].str.contains("alcohol")) | (df_targetuxpress['question_only'].str.contains("wine")) | (df_targetuxpress['question_only'].str.contains("beer") ) | (df_targetuxpress['question_only'].str.contains("whiskey")) | (df_targetuxpress['question_only'].str.contains("tequile")) | (df_targetuxpress['question_only'].str.contains("vodka")) | (df_targetuxpress['question_only'].str.contains("liquor"))]
# df_targetuxpress = df_targetuxpress[df_targetuxpress['question_only'].str.contains("y wife")]
df_targetuxpress = df_targetuxpress[(df_targetuxpress['question_only'].str.contains("y boss")) | (df_targetuxpress['question_only'].str.contains("y supervisor"))]
df_targetuxpress = df_targetuxpress[df_targetuxpress['question_only'].str.contains(r"\?")]  # keep only letters that actually ask a question
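The chained .str.contains calls (see the commented-out alcohol filter above) can be collapsed into a single alternation regex. A sketch of the same boss/supervisor filter in that style, meant as a replacement for, not an addition to, the cell above:
In [ ]:
# equivalent to the two chained .str.contains calls, written as one regex
keywords = ['y boss', 'y supervisor']
mask = df_targetuxpress['question_only'].str.contains('|'.join(map(re.escape, keywords)))
df_targetuxpress = df_targetuxpress[mask]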
In [ ]:
# Creating lists of questions + their relevant titles
df_targetuxpress_questions = df_targetuxpress['question_only'].tolist()
df_targetuxpress_title = df_targetuxpress['title_new'].tolist()


# Map each unique title to its question text
tempDict = dict(zip(df_targetuxpress_title, df_targetuxpress_questions))
In [ ]:
# Cleaning the data
wordnet_lemmatizer = WordNetLemmatizer()
english_stopwords = set(stopwords.words('english'))  # build the set once, not per token

def tokenize(text):
    text = text.lower()  # lower case
    for e in set(string.punctuation + '\n\t'):  # remove punctuation and line breaks/tabs
        text = text.replace(e, ' ')
    text = ' '.join(text.split())  # collapse runs of whitespace
    tokens = nltk.word_tokenize(text)
    tokens = [w for w in tokens if w not in english_stopwords]  # drop stopwords
    stems = []
    for item in tokens:  # lemmatize each remaining token
#         stems.append(PorterStemmer().stem(item))
        stems.append(wordnet_lemmatizer.lemmatize(item))
    return stems

# calculate tf-idf (might take a while)
print("calculating tf-idf")
# tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=0.005)
# tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=0.01)
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=0.025, max_df=.5)  # this step takes longest
# tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', min_df=0.1)
tfs = tfidf.fit_transform(list(tempDict.values()))  # dicts keep insertion order (3.7+), so values stay aligned with keys
print("reducing tf-idf to 395 dims")
# tfs_reduced = TruncatedSVD(n_components=300, random_state=0).fit_transform(tfs)
tfs_reduced = TruncatedSVD(n_components=395, random_state=0).fit_transform(tfs)
print("done")
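Two quick diagnostics are worth running at this point: how large a vocabulary the min_df/max_df thresholds left behind, and how much variance the 395 SVD components retain. A sketch (get_feature_names_out assumes scikit-learn >= 1.0; the svd handle refits the same decomposition as above, or keep a handle on the estimator instead of calling fit_transform inline):
In [ ]:
# vocabulary size after min_df/max_df pruning
print(len(tfidf.get_feature_names_out()))

# variance retained by the 395 components
svd = TruncatedSVD(n_components=395, random_state=0)
svd.fit(tfs)
print(svd.explained_variance_ratio_.sum())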
In [ ]:
model = TSNE(n_components=2, perplexity=100, verbose=2, method='exact').fit_transform(tfs_reduced)

# save to json file
x_axis = model[:, 0]
y_axis = model[:, 1]
# min-max normalize both axes to [0, 1]
x_norm = (x_axis - np.min(x_axis)) / (np.max(x_axis) - np.min(x_axis))
y_norm = (y_axis - np.min(y_axis)) / (np.max(y_axis) - np.min(y_axis))
data = {"x": x_norm.tolist(), "y": y_norm.tolist(), "names": list(tempDict.keys())}  # dict_keys is not JSON serializable
# with open('da_q_all_husband_min025_max_5_pp100_lemmed.json', 'w') as outfile:
with open('da_q_boss_min025_max_5_pp100_lemmed.json', 'w') as outfile:
    json.dump(data, outfile)
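Before round-tripping through JSON, a quick scatter of the normalized coordinates gives an immediate read on whether t-SNE produced usable clusters (a minimal sketch, assuming matplotlib is available):
In [ ]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 8))
plt.scatter(x_norm, y_norm, s=4, alpha=0.5)
plt.title('t-SNE of "boss" questions (perplexity=100)')
plt.show()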
In [ ]:
# df_xyplot = pd.read_json("da_q_all_husband_min025_max_5_pp100_lemmed.json")
df_xyplot = pd.read_json("da_q_boss_min025_max_5_pp100_lemmed.json")
In [ ]:
result = pd.merge(df_targetuxpress, df_xyplot, left_on='title_new', right_on='names')
# result.to_csv("da_q_all_husband_1_min025_max_5_pp100_lemmed.csv")
result.to_csv("da_q_boss_1_min025_max_5_pp100_lemmed.csv")
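Since title_new/names is meant to be a unique key on both sides, pandas can enforce that during the merge instead of silently fanning rows out on duplicates; a hedged variant of the cell above using merge's validate argument:
In [ ]:
# raises MergeError if either key column contains duplicates
result = pd.merge(df_targetuxpress, df_xyplot,
                  left_on='title_new', right_on='names',
                  validate='one_to_one')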