Using Reddit's API for Predicting Comments

In [1]:
#importing packages for dataframe manipulation, modeling (scikit-learn) and plotting (plotly)
import pandas as pd
import datetime as dt
import glob
import time
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import  TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import svm, linear_model, datasets
from sklearn.metrics import classification_report

from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
init_notebook_mode(connected=True)
%matplotlib inline

Load in the data of scraped results
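The scraping itself is not part of this notebook; the CSV files in hot_CSV_Files were collected beforehand. For context, a collection script could look something like the sketch below, which pulls the "hot" listing from Reddit's public JSON endpoint with the requests library. The endpoint handling and the fetch_hot_posts helper are assumptions for illustration, not the code that produced the data.

# Hypothetical sketch of the collection step (not the original scraper).
import time
import requests
import pandas as pd

def fetch_hot_posts(pages=4, limit=100):
    """Pull a few pages of the 'hot' listing and return them as a DataFrame."""
    headers = {'User-Agent': 'reddit-comment-predictor example script'}
    posts, after = [], None
    for _ in range(pages):
        res = requests.get('https://www.reddit.com/hot.json',
                           headers=headers,
                           params={'limit': limit, 'after': after})
        res.raise_for_status()
        listing = res.json()['data']
        for child in listing['children']:
            post = child['data']                # fields like title, subreddit, num_comments, created_utc
            post['time_fetched'] = time.time()  # used later to compute post age
            posts.append(post)
        after = listing['after']                # pagination token for the next page
        time.sleep(2)                           # stay well under Reddit's rate limits
    return pd.DataFrame(posts)

# fetch_hot_posts().to_csv('./hot_CSV_Files/hot_posts.csv')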

In [2]:
#read in each csv file contained in the hot_CSV_Files folder and combine into one dataframe
files = glob.glob('./hot_CSV_Files/*.csv')
hot_reddit = pd.concat([pd.read_csv(file, low_memory=False) for file in files], ignore_index=True)

#drop the unnamed index column carried over from the CSV files
hot_reddit.drop(columns=['Unnamed: 0'], inplace=True)
In [3]:
#convert the Unix timestamps to datetimes and compute each post's age at fetch time
hot_reddit['time_fetched'] = hot_reddit['time_fetched'].astype("datetime64[s]")
hot_reddit['created_utc'] = hot_reddit['created_utc'].astype("datetime64[s]")
hot_reddit['age'] = hot_reddit['time_fetched'] - hot_reddit['created_utc']
In [4]:
#Columns (features) that have potential to be influential
keep = ['age',
        'author',
        'created_utc',
        'domain',
        'gilded',
        'num_comments',
        'post_hint',
        'score',
        'time_fetched',
        'stickied',
        'subreddit',
        'subreddit_name_prefixed',
        'subreddit_subscribers',
        'title',
        'ups',
        'id',
        'downs'

       ]
In [5]:
#filter dataframe based on influential columns
reddit_data = hot_reddit[keep].copy()

#dropping duplicates in scraped data
reddit_data.drop_duplicates(['id'],inplace=True)

#actual reddit comment median
comment_median = reddit_data['num_comments'].median()

#convert age to hours for computation (total_seconds avoids dropping whole days from the timedelta)
reddit_data['age_hours'] = reddit_data['age'].dt.total_seconds() / 3600

#column for above or below median comments
reddit_data['median'] = reddit_data['num_comments'].map(lambda x: 1 if x > comment_median else 0)
In [6]:
age_trace = go.Histogram(x = reddit_data['age_hours'],
                         marker=dict(
                                color='orange')
          )

layout = go.Layout(
    title='Hot Post Age (hours)',
    xaxis=dict(
        title='Hours',    
        )
    )

data=[age_trace]
fig = go.Figure(data=data,layout=layout)

py.iplot(fig)
In [161]:
active_subs = reddit_data.groupby(['subreddit']).size().reset_index(name='count')

#ten most active subreddits, sorted ascending for the horizontal bar chart
top_subs = active_subs[active_subs['count'] > 15].sort_values(by='count', ascending=True).tail(10)
In [188]:
sub_trace = go.Bar(x = top_subs['count'],
                   y = top_subs['subreddit'],
                   orientation='h'
          )

layout = go.Layout(
    title='Active Subreddits',
    xaxis=dict(
        title='Number of times in "hot"',    
        )
    )

data=[sub_trace]
fig = go.Figure(data=data,layout=layout)

py.iplot(fig)

Setting up train test split on the data

In [9]:
# vectorizer initialization
cv = CountVectorizer(stop_words='english')

#separate our data into X and y for train_test_split analysis
X_t = pd.get_dummies(reddit_data[['age_hours','subreddit','title']],columns=['subreddit'])


#vectorize "title" and put into a DataFrame
X_train_counts = pd.DataFrame(cv.fit_transform(X_t['title']).todense(), columns = ["t_"+w for w in cv.get_feature_names()], index = X_t.index)


# concatenate the vectorized titles into the X_t dataset
X = pd.concat([X_t.drop(labels = ['title'], axis=1),X_train_counts],axis=1)
y = reddit_data['median']
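The dense design matrix above keeps every title-word count as a separate DataFrame column, which is workable at this data size. As an aside, the same features could also be assembled sparsely with scikit-learn's ColumnTransformer (scikit-learn >= 0.20); the following is an untested sketch of that alternative, not the approach used in this notebook.

# Alternative sketch: build the same features inside a pipeline (sparse, no dense DataFrame).
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

feature_builder = ColumnTransformer([
    ('title_words', CountVectorizer(stop_words='english'), 'title'),       # bag-of-words on the title
    ('subreddit', OneHotEncoder(handle_unknown='ignore'), ['subreddit']),  # dummy-code the subreddit
], remainder='passthrough')                                                # pass age_hours through as-is

pipe = Pipeline([('features', feature_builder),
                 ('clf', LogisticRegression())])

# pipe.fit(reddit_data[['title', 'subreddit', 'age_hours']], reddit_data['median'])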
In [10]:
#train, test and split for logistic regression
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.30)
In [114]:
# baseline accuracy 
ba = reddit_data['median'].value_counts(normalize=True)

ba.max()
Out[114]:
0.5037865748709123

Training and Predicting our Naive Bayes Multinomial model

In [11]:
nbm = MultinomialNB()
nbm.fit(X_train,y_train)
print(nbm.score(X_train,y_train))
print(nbm.score(X_test,y_test))
0.9149250061470371
0.6110154905335629
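TfidfTransformer and Pipeline are imported at the top of the notebook but never used. One option that was not attempted here would be to re-weight the title counts with TF-IDF before the Naive Bayes step; the sketch below (titles only, untested on this data) shows what that could look like.

# Untested sketch: TF-IDF re-weighting of title counts before MultinomialNB.
# Uses only the raw titles so everything fits in a single pipeline.
nb_tfidf = Pipeline([
    ('counts', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('nb', MultinomialNB()),
])
nb_tfidf.fit(reddit_data.loc[y_train.index, 'title'], y_train)
print(nb_tfidf.score(reddit_data.loc[y_test.index, 'title'], y_test))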
In [ ]:
# That didn't do so well, let's try a logistic regression model

Training and Predicting our Logistic Regression model

In [16]:
#instantiate logistic regression and fit to training data, then score on test
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
print(logmodel.score(X_train,y_train))
print(logmodel.score(X_test,y_test))
0.9645930661421195
0.7062535857716581
In [131]:
#cross validate logistic model with 5 folds
logcv = cross_val_score(logmodel, X, y, cv=5, scoring='accuracy').mean()
In [134]:
logcv
Out[134]:
0.7306454158070506
In [202]:
lr_pred = logmodel.predict(X_test)
print(classification_report(y_test, lr_pred))
             precision    recall  f1-score   support

          0       0.52      1.00      0.68       905
          1       1.00      0.01      0.01       838

avg / total       0.75      0.52      0.36      1743

In [133]:
#extracting the coefficients for analysis
coef = pd.DataFrame(logmodel.coef_, columns=X_test.columns).T

coef.sort_values(by=0, ascending=False,inplace=True)
In [189]:
top_coef = coef.head(10)             #top 10 coefs
bot_coef = coef.tail(10)             #bottom 10 coefs
all_coef = top_coef.append(bot_coef) # combine top and bottom coefs into a df
In [190]:
#graph coefs for visualization
top_trace = go.Bar(x = top_coef.index,
                  y = top_coef[0],
                   name = 'Top'
          )
bot_trace = go.Bar(x = bot_coef.index,
                  y = bot_coef[0],
                   name = 'Bottom'
                  )
layout = go.Layout(
    title='LR Important Coefficients',
    yaxis=dict(
        title='Coefficient',
        )
    )

data=[top_trace,bot_trace]
fig = go.Figure(data=data,layout=layout)

py.iplot(fig)
In [101]:
top_coef # best coefs (all subreddits, not text, interesting)
Out[101]:
0
subreddit_trippinthroughtime 2.034494
subreddit_CrappyDesign 1.816297
subreddit_comics 1.687779
subreddit_mildlyinteresting 1.563330
subreddit_oddlysatisfying 1.469566
subreddit_BikiniBottomTwitter 1.444860
subreddit_gifs 1.412162
subreddit_Art 1.408516
subreddit_PeopleFuckingDying 1.390519
subreddit_wholesomememes 1.380903
In [130]:
#Extracting top words
lr_words = []
for word in coef.index[0:50]:   
    if "subreddit_" not in word:
        lr_words.append(word)
#showing top 10 words
coef.T[lr_words].T.head(10)
Out[130]:
0
t_2meirl4meirl 1.190842
t_coming 1.139918
t_hit 1.026354
t_sounds 0.981465
t_guy 0.961954
t_blind 0.959995
t_hmmm 0.957740
t_couple 0.943685
t_simpsons 0.904151
t_ball 0.879237

Random Forest using only Subreddits

In [19]:
from sklearn.ensemble import RandomForestClassifier
In [205]:
target = reddit_data['median']
data = pd.get_dummies(reddit_data['subreddit'])
In [206]:
data_train, data_test, target_train, target_test = train_test_split(data, 
                                                    target, 
                                                    test_size=0.30, 
                                                    random_state=101)
In [207]:
%%time
#using only subreddit
rfc = RandomForestClassifier(n_estimators=600)

rfc.fit(data_train,target_train)
print(rfc.score(data_train,target_train))
print(rfc.score(data_test,target_test))
0.8507499385296288
0.7200229489386116
CPU times: user 3min 36s, sys: 1.51 s, total: 3min 37s
Wall time: 3min 40s
In [208]:
rf_sub_pred = rfc.predict(data_test)
print(classification_report(target_test,rf_sub_pred))
             precision    recall  f1-score   support

          0       0.70      0.76      0.73       871
          1       0.74      0.68      0.71       872

avg / total       0.72      0.72      0.72      1743

In [209]:
#create series of feature importances
rfc_imp = pd.Series(rfc.feature_importances_,index=data_test.columns).sort_values(ascending=False)
In [210]:
#chart top feature importances
top_trace = go.Bar(x = rfc_imp.head(10).index,
                  y = rfc_imp.head(10),
                   name = 'Top 10'
          )
layout = go.Layout(
    title='Random Forest Feature Importances',
    yaxis=dict(
        title='Importance',
        )
    )

data=[top_trace]
fig = go.Figure(data=data,layout=layout)

py.iplot(fig)
In [211]:
#see top feature importances
pd.DataFrame(rfc_imp).head(10)
Out[211]:
0
funny 0.009231
aww 0.007642
gaming 0.007456
gifs 0.007246
todayilearned 0.007052
FortNiteBR 0.006910
mildlyinteresting 0.006640
pics 0.005800
BlackPeopleTwitter 0.005629
videos 0.005244

Random Forest using multiple features

In [27]:
%%time
#using age, subreddit, title
rfc_2 = RandomForestClassifier(n_estimators=600)

rfc_2.fit(X_train,y_train)
print(rfc_2.score(X_train,y_train))
print(rfc_2.score(X_test,y_test))
1.0
0.6919104991394148
CPU times: user 2min 6s, sys: 812 ms, total: 2min 7s
Wall time: 2min 7s
In [212]:
rfc_all_pred = rfc_2.predict(X_test)
print(classification_report(y_test,rfc_all_pred))
             precision    recall  f1-score   support

          0       0.70      0.72      0.71       905
          1       0.69      0.66      0.67       838

avg / total       0.69      0.69      0.69      1743

In [30]:
#using "title", "age" and "subreddit" as features
top_trace = go.Bar(x = rfc_2_imp.head(10).index.values,
                  y = rfc_2_imp.head(10).values,
                   name = "Top 10"
                   
          )
layout = go.Layout(
    title='Random Forest Important Features',
    yaxis=dict(
        title='Feature Importance Score',
        )
    )

data=[top_trace]
fig = go.Figure(data=data,layout=layout)

py.iplot(fig)
In [194]:
#create a series with the feature importances
rfc_2_imp = pd.Series(rfc_2.feature_importances_,index=X_test.columns).sort_values(ascending=False)

rfc_2_words = []
for word in rfc_2_imp.index[0:50]:   
    if "subreddit_" not in word:
        rfc_2_words.append(word)

rfc_2_words = rfc_2_words[1:]

#top 10 words from Random Forest
pd.DataFrame(rfc_2_imp.T[rfc_2_words].head(10))
Out[194]:
0
t_people 0.002177
t_official 0.002116
t_trailer 0.001878
t_roseanne 0.001694
t_76 0.001410
t_cat 0.001401
t_games 0.001342
t_just 0.001217
t_little 0.001168
t_til 0.001142
In [199]:
print(f'The earliest post was created at: {reddit_data["created_utc"].min()}')
print(f'The latest post was created at: {reddit_data["created_utc"].max()}')
print(f'The median number of comments was {comment_median}')
The earliest post was created at: 2018-05-29 18:34:50
The latest post was created at: 2018-06-01 21:33:35
The median number of comments was 36.0

Project purpose:

Determine whether it is possible to predict that a Reddit post will garner more than the median number of comments. The median was calculated from the number of comments on the posts scraped from May 29th to June 1st.

Utilizing machine learning along with the Python requests library, a model was built to ascertain whether the combination of title and subreddit posting location could predict a post gaining more than the median number of comments (36).

Model results showed that whether a post meets this target could be predicted with 73% accuracy (the mean 5-fold cross-validation score of the logistic regression model).
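As a usage illustration, the fitted CountVectorizer and logistic regression from above could be wrapped to score a hypothetical new post. The predict_above_median helper below is an assumption added for illustration, not part of the original analysis; it assumes cv, X and logmodel are still the objects fit in the comment-median analysis above (logmodel is re-fit later in the notebook).

# Hypothetical helper (not in the original notebook): score a single new post.
def predict_above_median(title, subreddit, age_hours=1.0):
    row = pd.DataFrame(0, index=[0], columns=X.columns)    # one all-zero row with the training columns
    row['age_hours'] = age_hours
    if 'subreddit_' + subreddit in row.columns:
        row['subreddit_' + subreddit] = 1                  # set the matching subreddit dummy
    counts = cv.transform([title]).tocoo()                 # reuse the already-fitted vectorizer
    vocab = cv.get_feature_names()
    for idx, count in zip(counts.col, counts.data):
        col = 't_' + vocab[idx]
        if col in row.columns:
            row[col] = count
    return logmodel.predict(row)[0]                        # 1 = predicted to beat the comment median

# predict_above_median('My cat did something mildly interesting', 'mildlyinteresting')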

Determining Karma (score) based on post content

In [217]:
#separate our data into X and y for train_test_split analysis
reddit_data['high_score'] = reddit_data['score'] >= 15000
data = pd.get_dummies(reddit_data[['subreddit','title','num_comments']],columns=['subreddit'])


#vectorize "title" and put into a DataFrame
X_counts = pd.DataFrame(cv.fit_transform(data['title']).todense(), columns = ["t_"+w for w in cv.get_feature_names()], index = data.index)


# concatenate the vectorized titles into the X_t dataset
data = pd.concat([data.drop(labels = ['title'], axis=1),X_counts],axis=1)
karma = reddit_data['high_score']
In [219]:
data_train, data_test, target_train, target_test = train_test_split(data,
                                                                    karma,
                                                                    test_size=0.30, 
                                                                    random_state=101)
In [220]:
#instantiate logistic regression and fit to training data, then score on test
logmodel = LogisticRegression()
logmodel.fit(data_train,target_train)
print(logmodel.score(data_train,target_train))
print(logmodel.score(data_test,target_test))
0.9759036144578314
0.9541021227768216
In [222]:
lr_score_pred = logmodel.predict(data_test)
print(classification_report(target_test,lr_score_pred))
             precision    recall  f1-score   support

      False       0.96      1.00      0.98      1637
       True       0.84      0.30      0.44       106

avg / total       0.95      0.95      0.94      1743

In [93]:
#extracting the coefficients for analysis
coef = pd.DataFrame(logmodel.coef_, columns=data_test.columns).T

coef.sort_values(by=0, ascending=False,inplace=True)

top_coef = coef.head(10)             #top 10 coefs
top_coef.sort_index()
bot_coef = coef.tail(10)             #bottom 10 coefs
all_coef = top_coef.append(bot_coef) # combine top and bottom coefs into a df

#graph coefs for visualization
top_trace = go.Bar(x = top_coef.index,
                  y = top_coef[0],
                   name = 'Top'
          )
bot_trace = go.Bar(x = bot_coef.index,
                  y = bot_coef[0],
                   name = 'Bottom'
                  )
layout = go.Layout(
    title='Logistic Regression Score Coefs',
    yaxis=dict(
        title='Coefficient',
        )
    )

data=[top_trace,bot_trace]
fig = go.Figure(data=data,layout=layout)

py.iplot(fig, filename = 'LR_Score_Coefs')
In [50]:
print(top_coef)
                                      0
subreddit_trippinthroughtime   2.034494
subreddit_CrappyDesign         1.816297
subreddit_comics               1.687779
subreddit_mildlyinteresting    1.563330
subreddit_oddlysatisfying      1.469566
subreddit_BikiniBottomTwitter  1.444860
subreddit_gifs                 1.412162
subreddit_Art                  1.408516
subreddit_PeopleFuckingDying   1.390519
subreddit_wholesomememes       1.380903
In [85]:
#getting the most used words out of the dataset
top_words = X_counts.sum()
top_words = top_words[top_words >= 50].sort_values(ascending=False)
In [96]:
words = go.Bar(x = top_words,
               y = top_words.index,
               orientation = 'h')
layout = go.Layout(title='Most Used Words',
                   xaxis = dict(
                   title = 'Word Count'),
                   yaxis = dict(
                   title = 'Word Used')
                   
                  )

data = [words]
fig = go.Figure(data = data, layout=layout)
py.iplot(fig)

Executive Summary

In [200]:
print(f'The earliest post was created at: {reddit_data["created_utc"].min()}')
print(f'The latest post was created at: {reddit_data["created_utc"].max()}')
print(f'The median number of comments was {comment_median}')
The earliest post was created at: 2018-05-29 18:34:50
The latest post was created at: 2018-06-01 21:33:35
The median number of comments was 36.0

Project Summary

Initial Project Scope:

Determine whether it is possible to predict if a Reddit post will garner more than the median number of comments. The median was calculated from the number of comments on the posts scraped from May 29th to June 1st.

Secondary Project Scope:

Build a model to predict a post's likelihood of gaining a score over 15,000, which would make the post 'Hot' by Reddit standards.

Project Description

A dataset was created from live web data (Reddit.com) using the Python requests library. Multiple machine learning models were then built and tested against the two scopes. A logistic regression model showed that whether a post meets the target number of comments could be predicted with 73% accuracy, fulfilling scope #1.

Using the same data, a logistic regression model was then fit to predict whether a post's score exceeds the target. This model reached 95.4% accuracy in predicting a post's score class from the subreddit and title words, though recall on the high-scoring class was low (0.30), reflecting the class imbalance.