```python
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.cluster import KMeans
```

```python
# Use this DataFrame to narrow down users who have written reviews for restaurants
filename = 'yelp_review.csv'
filepath = os.path.join('~', 'homework', 'yelp-dataset', filename)
reviews_data_df = pd.read_csv(filepath)

filename = 'restaurants.csv'
filepath = os.path.join(filename)
restaurant_data_df = pd.read_csv(filepath)
restaurant_data_df.rename(columns={'stars': 'overall_stars'}, inplace=True)

business_and_review = restaurant_data_df.merge(reviews_data_df, on='business_id')
business_and_review.head()
```
```
business_id name neighborhood address city state postal_code latitude longitude overall_stars ... is_open categories review_id user_id stars date text useful funny cool
0 PfOCPjBrlQAnz__NXj9h_w "Brick House Tavern + Tap" NaN "581 Howe Ave" Cuyahoga Falls OH 44221 41.119535 -81.47569 3.5 ... 1 American (New);Nightlife;Bars;Sandwiches;Ameri... rnvu74aNIPn93Z9Ue11Epw bgtr7jOlALe8ay4g7pnf4A 4 2016-05-05 I've always enjoyed my time at brick house foo... 0 0 0
1 PfOCPjBrlQAnz__NXj9h_w "Brick House Tavern + Tap" NaN "581 Howe Ave" Cuyahoga Falls OH 44221 41.119535 -81.47569 3.5 ... 1 American (New);Nightlife;Bars;Sandwiches;Ameri... OE5dH664oSSK_-TvIl263w ndZOHkG_FWdL4TRd9xRfNQ 2 2017-03-19 1st time here. Came w my Unc bc Louies was clo... 0 0 0
2 PfOCPjBrlQAnz__NXj9h_w "Brick House Tavern + Tap" NaN "581 Howe Ave" Cuyahoga Falls OH 44221 41.119535 -81.47569 3.5 ... 1 American (New);Nightlife;Bars;Sandwiches;Ameri... SWeHmo8uHHWbbv94VdAMeQ UqtE83J-6NMQNNJzpnWwEQ 1 2015-08-31 Worse service ever andI use to be a server so ... 0 0 0
3 PfOCPjBrlQAnz__NXj9h_w "Brick House Tavern + Tap" NaN "581 Howe Ave" Cuyahoga Falls OH 44221 41.119535 -81.47569 3.5 ... 1 American (New);Nightlife;Bars;Sandwiches;Ameri... fCCL0pAgYGccUs8shHHIUA PixAmdRa58XZ3hr8YLS36g 5 2015-10-09 I am updating my review to 5-stars because I t... 2 0 2
4 PfOCPjBrlQAnz__NXj9h_w "Brick House Tavern + Tap" NaN "581 Howe Ave" Cuyahoga Falls OH 44221 41.119535 -81.47569 3.5 ... 1 American (New);Nightlife;Bars;Sandwiches;Ameri... -VOh0NNnIo1r24hjwSZL4Q Hly0NEWXMxbuXa2zm_NJOQ 4 2015-01-15 I enjoyed this place. I went the night the Bu... 0 0 0

5 rows × 21 columns
```

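`restaurants.csv` arrives pre-filtered to restaurant businesses. A minimal sketch of how such a file could be produced from the raw Yelp business export, assuming a `yelp_business.csv` with a semicolon-delimited `categories` column (the filename and this step are assumptions; the preprocessing is not shown in this notebook):

```python
# Hypothetical preprocessing: keep only businesses tagged as restaurants
business_df = pd.read_csv('yelp_business.csv')
mask = business_df['categories'].str.contains('Restaurant', na=False)
business_df[mask].to_csv('restaurants.csv', index=False)
```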
```python
# Rating distributions normalized by total reviews written
filename = 'restaurant_reviewers_pct.xlsx'
filepath = os.path.join(filename)
restaurant_reviews_pct = pd.read_excel(filepath, header=1)
restaurant_reviews_pct.fillna(0, inplace=True)
restaurant_reviews_pct.set_index(['User Id'], inplace=True)
restaurant_reviews_pct.head()
```
```
                          1    2    3    4    5
User Id
0_0-eY7lxC-2DgCATusr-g  0.0  1.0  0.0  0.0  0.0
0_0Lg3MMEi9NAGztLdL52Q  0.0  0.0  0.0  0.0  1.0
0_0r9eJyfLs87vFLaXIsRg  0.0  0.0  0.0  1.0  0.0
0_0u-ggFJhigAwWuvok8Zg  0.0  0.0  0.0  0.0  1.0
0_0YL1uj3Ds_ZFdPt9wx4w  0.0  0.0  0.0  1.0  0.0
```
```python
# Raw counts of ratings associated with reviews
filename = 'restaurant_reviewers_cnt.xlsx'
filepath = os.path.join(filename)
restaurant_reviews_cnt = pd.read_excel(filepath, header=1)
restaurant_reviews_cnt.fillna(0, inplace=True)
restaurant_reviews_cnt.set_index(['User Id'], inplace=True)
restaurant_reviews_cnt.head()
```
```
                          1    2    3    4    5
User Id
0_0-eY7lxC-2DgCATusr-g  0.0  1.0  0.0  0.0  0.0
0_0Lg3MMEi9NAGztLdL52Q  0.0  0.0  0.0  0.0  1.0
0_0r9eJyfLs87vFLaXIsRg  0.0  0.0  0.0  1.0  0.0
0_0u-ggFJhigAwWuvok8Zg  0.0  0.0  0.0  0.0  1.0
0_0YL1uj3Ds_ZFdPt9wx4w  0.0  0.0  0.0  1.0  0.0
```
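Both Excel files above are precomputed outside this notebook. Assuming they were derived from the merged reviews, a minimal sketch of rebuilding the same two tables with `pd.crosstab` (equivalence assumed, not verified here):

```python
# Raw per-user rating counts, then the same counts as row-wise proportions.
# Note: crosstab column labels come out as ints; the Excel files use strings.
cnt = pd.crosstab(business_and_review['user_id'], business_and_review['stars'])
pct = cnt.div(cnt.sum(axis=1), axis=0)
```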
```python
# Statistical metrics of rating distributions
filename = 'rating_description_per_user_alt.csv'
filepath = os.path.join(filename)
rating_desc = pd.read_csv(filepath)
rating_desc.drop('mode', axis=1, inplace=True)
rating_desc.fillna(0, inplace=True)
rating_desc.set_index('user_id', inplace=True)
rating_desc.head()
```
```
                        10th_quantile  25th_quantile  75th_quantile  90th_quantile  max  mean_avg  med  min  review_count       std       var
user_id
---1lKK3aKOuomHnwAkAow            1.0           3.00           5.00            5.0    5  3.861111  4.5    1            72  1.466101  2.149452
---PLwSf5gKdIoVnyRHgBA            3.0           3.00           3.00            3.0    3  3.000000  3.0    3             1  0.000000  0.000000
---cu1hq55BP9DWVXXKHZg            1.3           1.75           3.25            3.7    4  2.500000  2.5    1             2  2.121320  4.500000
---udAKDsn0yQXmzbWQNSw            4.1           4.25           4.75            4.9    5  4.500000  4.5    4             2  0.707107  0.500000
--0RtXvcOIE4XbErYca6Rw            4.0           4.00           4.00            4.0    4  4.000000  4.0    4             1  0.000000  0.000000
```
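This CSV is also precomputed; assuming it summarizes the same merged reviews, a sketch of reproducing the per-user statistics with a groupby:

```python
# Per-user summary statistics over star ratings (assumed source of the CSV)
g = business_and_review.groupby('user_id')['stars']
desc = pd.DataFrame({
    '10th_quantile': g.quantile(0.10), '25th_quantile': g.quantile(0.25),
    '75th_quantile': g.quantile(0.75), '90th_quantile': g.quantile(0.90),
    'max': g.max(), 'mean_avg': g.mean(), 'med': g.median(), 'min': g.min(),
    'review_count': g.count(), 'std': g.std(), 'var': g.var(),
})
```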
```python
# Create derived-descriptors DataFrame
temp_1 = restaurant_reviews_pct.merge(rating_desc, left_index=True, right_index=True)

def round_star(metric):
    # Round a metric to the nearest whole star (defined but not used below)
    return int(np.round(metric))

# Slope of the rating histogram from the minimum rating up to the median
def min_med(x):
    med_star = int(np.floor(x['med']))
    min_star = int(np.round(x['min']))
    rise = x[str(med_star)] - x[str(min_star)]
    run = med_star - min_star
    if run == 0:
        return 0
    return rise / run

temp_1['min_med_slope'] = temp_1.apply(min_med, axis=1)

# Slope of the rating histogram from the median up to the maximum rating
def med_max(x):
    med_star = int(np.ceil(x['med']))
    max_star = int(np.round(x['max']))
    rise = x[str(max_star)] - x[str(med_star)]
    run = max_star - med_star
    if run == 0:
        return 0
    return rise / run

temp_1['med_max_slope'] = temp_1.apply(med_max, axis=1)

# Skew proxy (half-star-rounded mean minus median) and peak indicator
temp_1['mean-med'] = temp_1.apply(lambda x: np.round(x['mean_avg'] * 2) / 2 - x['med'], axis=1)
temp_1['inflection'] = temp_1.apply(lambda x: x['min_med_slope'] * x['med_max_slope'], axis=1)

derived = temp_1[['mean-med', 'min_med_slope', 'med_max_slope', 'inflection']]
derived.head()
```
```
                        mean-med  min_med_slope  med_max_slope  inflection
0_0-eY7lxC-2DgCATusr-g       0.0            0.0            0.0         0.0
0_0Lg3MMEi9NAGztLdL52Q       0.0            0.0            0.0         0.0
0_0r9eJyfLs87vFLaXIsRg       0.0            0.0            0.0         0.0
0_0u-ggFJhigAwWuvok8Zg       0.0            0.0            0.0         0.0
0_0YL1uj3Ds_ZFdPt9wx4w       0.0            0.0            0.0         0.0
```
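To make the shape descriptors concrete, here is a small worked example on a hypothetical rating distribution (values invented for illustration; the arithmetic mirrors `min_med` and `med_max` above):

```python
# Hypothetical reviewer: 5% 1-star, 10% 2-star, 15% 3-star, 45% 4-star, 25% 5-star
pct = {'1': 0.05, '2': 0.10, '3': 0.15, '4': 0.45, '5': 0.25}
med_star, min_star, max_star = 4, 1, 5  # median, lowest, and highest rating given

min_med_slope = (pct[str(med_star)] - pct[str(min_star)]) / (med_star - min_star)  # ~0.133
med_max_slope = (pct[str(max_star)] - pct[str(med_star)]) / (max_star - med_star)  # -0.200
inflection = min_med_slope * med_max_slope  # < 0: the histogram peaks at the median
print(min_med_slope, med_max_slope, inflection)
```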
```python
def plot_dist(user, show=True):
    # Histogram of one reviewer's star ratings, bins centered on 1..5
    data = business_and_review.loc[business_and_review['user_id'] == user]['stars']
    plt.hist(list(data), bins=[x + 0.5 for x in range(6)], align='mid', width=0.9)
    plt.title(user)
    if show:
        plt.show()

def normalize(input_list):
    # Min-max scale a column to [0, 1]; pass through if already scaled
    lo, hi = np.min(input_list), np.max(input_list)
    if lo == 0 and hi == 1:
        return input_list
    ref = hi - lo
    if ref == 0:
        # Constant column: avoid division by zero
        return input_list - lo
    return (input_list - lo) / ref
```
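A quick usage example (the user id is taken from the `source` sample shown further below):

```python
# Plot one reviewer's rating histogram
plot_dist('0_6zmGnyHQrls5sJdKqudQ')
```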
```python
restaurant_reviews_cnt.index.nunique()
```

```
960561
```
```python
# Candidate feature sets explored (alternatives left commented out):
# source = restaurant_reviews_cnt
# source = restaurant_reviews_pct
# source = rating_desc
# source = restaurant_reviews_pct.merge(rating_desc,left_index=True,right_index=True)
# source = restaurant_reviews_pct.merge(
#     restaurant_reviews_cnt,left_index=True,right_index=True).merge(
#     rating_desc[[
#         'review_count','10th_quantile','90th_quantile','mean_avg','var','med'
#     ]],left_index=True,right_index=True)
source = restaurant_reviews_pct.merge(
    rating_desc[['review_count']],
    left_index=True, right_index=True).merge(derived, left_index=True, right_index=True)
# source = derived
source.index.nunique()
```

```
960560
```
```python
# Other filtering steps
if True:
    # Remove low-count reviewers
    source = source[source['review_count'] >= 10]
source.index.nunique()
```

```
64986
```
```python
if False:
    # Separate single-rating reviewers (those who predominantly give out the same rating)
    monotony_threshold = 0.8
    # source = source[source.apply(lambda x: x[['1_x','2_x','3_x','4_x','5_x']].max() < monotony_threshold, axis=1)]
    source = source[source.apply(lambda x: x[['1', '2', '3', '4', '5']].max() < monotony_threshold, axis=1)]
source.index.nunique()
source.apply(normalize, axis=0).head()
```
```
                               1         2         3         4         5  review_count  mean-med  min_med_slope  med_max_slope  inflection
0_6zmGnyHQrls5sJdKqudQ  0.000000  0.000000  0.295455  0.090909  0.636364      0.000337     0.375       0.466999       0.640000    0.838762
0_9HCOdSBSrgeboN_pDcgA  0.105263  0.000000  0.057018  0.210526  0.631579      0.003034     0.375       0.432588       0.640000    0.838762
0_71XZqxNetnJjJIINQCOg  0.000000  0.227273  0.196970  0.272727  0.363636      0.000337     0.500       0.373599       0.705455    0.843559
0_aerl3cE66GEmqiAh9_aA  0.000000  0.125000  0.000000  0.100000  0.800000      0.000000     0.375       0.502283       0.640000    0.838762
0_Cfb4ZEG2mQY9C2ACHnbA  0.062500  0.195312  0.270833  0.343750  0.187500      0.007417     0.375       0.406678       0.527500    0.821756
```
```python
data = source.apply(normalize, axis=0).values.tolist()
```

```python
# Graphically show Sum of Squares Error (elbow method)
sse = []
try_k = 15
for k in range(1, try_k + 1):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data)
    sse.append(kmeans.inertia_)

plt.plot(range(1, try_k + 1), sse)
plt.title('Sum of Squares Error vs. N Clusters')
plt.xlabel('N Clusters')
plt.ylabel('SSE')
plt.show()
```

*Figure: Sum of Squares Error vs. number of clusters (elbow curve).*

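As a cross-check on the elbow reading, one common complement (not part of the original selection logic below) is the silhouette score; a minimal sketch, sampled to stay tractable at this data size:

```python
# Silhouette scores for candidate k (higher is better); sampled for speed
from sklearn.metrics import silhouette_score

for k in range(2, 8):
    labels = KMeans(n_clusters=k).fit_predict(data)
    score = silhouette_score(data, labels, sample_size=10000, random_state=0)
    print(f'k = {k}: silhouette = {score:.3f}')
```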
```python
# Automate selection of N clusters: pick the first k beyond which adding
# another cluster reduces SSE by no more than select_threshold percent
select_threshold = 10
gain = []
for i in range(1, try_k):
    this_gain = 100 * (sse[i-1] - sse[i]) / sse[i-1]
    gain.append(this_gain)
    if this_gain <= select_threshold:
        print(f'select k = {i}')
        n_clusters = i
        break
```

```
select k = 5
```
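For reference, the full marginal-gain curve can be printed without the early break (same arithmetic as the loop above):

```python
# Percentage SSE reduction when moving from k to k+1 clusters
for k in range(1, try_k):
    print(f'{k} -> {k + 1} clusters: {100 * (sse[k-1] - sse[k]) / sse[k-1]:.1f}% SSE reduction')
```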
```python
kmeans = KMeans(n_clusters=n_clusters)
# kmeans = KMeans(n_clusters=6)
kmeans.fit(data)

# Predict the clusters
predicted_clusters = kmeans.predict(data)

k_means_output = pd.DataFrame({'user_id': list(source.index),
                               'prediction': list(predicted_clusters)})

output_context = source.merge(k_means_output, left_index=True, right_on='user_id')
output_context.groupby('prediction')['user_id'].count()
```

```
prediction
0    12666
1    13619
2     6542
3    18843
4    13316
Name: user_id, dtype: int64
```
```python
# Spot check predictions: plot `sample` random reviewer histograms per cluster
sample = 10
plt.figure(figsize=[20, 10])
counter = 1
for i in range(n_clusters):
    sample_users = np.random.choice(output_context[output_context['prediction'] == i]['user_id'], sample)
    for j, this_user in enumerate(sample_users):
        plt.subplot(n_clusters, sample, counter)
        plot_dist(this_user, show=False)
        plt.title(f'{i}, {this_user[:5]}')
        counter += 1
plt.show()
```

*Figure: sample rating histograms, one row per cluster (cluster id and truncated user id in each panel title).*

```python
output_context.groupby('prediction').head().sort_values('prediction')
```
```
   1        2        3        4        5        review_count  mean-med  min_med_slope  med_max_slope  inflection  prediction  user_id
12 0.000000 0.052632 0.105263 0.526316 0.315789 19 0.0 0.236842 -0.210526 -0.049861 0 0_NbnQy2AwOUDtDUr4uDTg
32 0.044444 0.044444 0.133333 0.488889 0.288889 45 0.0 0.148148 -0.200000 -0.029630 0 0A9i7xt77g1PYS7j219XcQ
10 0.037037 0.148148 0.111111 0.592593 0.111111 27 -0.5 0.185185 -0.481481 -0.089163 0 0_J-edyZgAkDPHjpZKmiMg
9 0.000000 0.050633 0.240506 0.569620 0.139241 79 0.0 0.259494 -0.430380 -0.111681 0 0_iROv6FL4KCveV916IpTA
5 0.000000 0.090909 0.090909 0.545455 0.272727 11 0.0 0.227273 -0.272727 -0.061983 0 0_dLB7aYv_yCRg13g3cs0g
11 0.000000 0.000000 0.090909 0.090909 0.818182 11 -0.5 0.363636 0.000000 0.000000 1 0_m_fXlRXxGFKUMkVo2nUg
8 0.000000 0.000000 0.100000 0.300000 0.600000 10 -0.5 0.250000 0.000000 0.000000 1 0_ImqV5kJJC01CBX7uk8cQ
0 0.000000 0.000000 0.272727 0.090909 0.636364 11 -0.5 0.181818 0.000000 0.000000 1 0_6zmGnyHQrls5sJdKqudQ
1 0.105263 0.000000 0.052632 0.210526 0.631579 19 -0.5 0.131579 0.000000 0.000000 1 0_9HCOdSBSrgeboN_pDcgA
3 0.000000 0.100000 0.000000 0.100000 0.800000 10 -0.5 0.233333 0.000000 0.000000 1 0_aerl3cE66GEmqiAh9_aA
47 0.400000 0.100000 0.000000 0.000000 0.500000 10 -0.5 -0.200000 0.500000 -0.100000 2 0AKk4nxIt0rlbyylj7s73Q
30 0.190476 0.285714 0.095238 0.190476 0.238095 21 0.0 -0.047619 0.071429 -0.003401 2 0A5Bjqb3dLj6NfsBHzDuXA
44 0.333333 0.047619 0.142857 0.190476 0.285714 21 0.0 -0.095238 0.071429 -0.006803 2 0aJBU9Jv15xA3MA5HZ0gjg
26 0.700000 0.300000 0.000000 0.000000 0.000000 10 0.5 0.000000 -0.400000 -0.000000 2 0-Ypt2ywfCYQ_HEfOZ0r4g
24 0.083333 0.333333 0.166667 0.208333 0.208333 24 0.0 0.041667 0.020833 0.000868 2 0-UxxxWLz1muOzPx2F3-kQ
7 0.000000 0.090909 0.272727 0.272727 0.363636 11 0.0 0.090909 0.090909 0.008264 3 0_G3Cuk88nBAQ1M03g56lA
16 0.090909 0.000000 0.181818 0.363636 0.363636 11 0.0 0.090909 0.000000 0.000000 3 0-5GNzaQOJuMJeVgECShEw
14 0.000000 0.214286 0.214286 0.214286 0.357143 14 -0.5 0.000000 0.142857 0.000000 3 0_nrYU_QKIHQ3zc1w5Rm3w
13 0.200000 0.133333 0.000000 0.266667 0.400000 15 -0.5 0.022222 0.133333 0.002963 3 0_Nf5WBUb3U1j610Yjbg7A
2 0.000000 0.181818 0.181818 0.272727 0.363636 11 0.0 0.045455 0.090909 0.004132 3 0_71XZqxNetnJjJIINQCOg
17 0.071429 0.500000 0.071429 0.285714 0.071429 14 1.0 0.428571 -0.142857 -0.061224 4 0-7mDJd9bOGtmq3FHoZB-Q
19 0.000000 0.100000 0.200000 0.400000 0.300000 10 0.0 0.150000 -0.100000 -0.015000 4 0-BQfUKMcI0dX_OiaGlJjA
4 0.062500 0.156250 0.250000 0.343750 0.187500 32 -0.5 0.093750 -0.156250 -0.014648 4 0_Cfb4ZEG2mQY9C2ACHnbA
15 0.250000 0.083333 0.583333 0.083333 0.000000 12 -0.5 0.166667 -0.500000 -0.083333 4 0_RQdCuZGHAtBIZ5ST8TGw
6 0.200000 0.100000 0.200000 0.300000 0.200000 10 -0.5 0.000000 -0.100000 -0.000000 4 0_FRTEe0XJnPxS4Nsj5tng
```
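To characterize the five clusters beyond spot checks, one quick summary (not in the original notebook) is the mean feature profile per cluster:

```python
# Average rating distribution and shape descriptors within each cluster
profile_cols = ['1', '2', '3', '4', '5', 'mean-med', 'min_med_slope', 'med_max_slope']
output_context.groupby('prediction')[profile_cols].mean()
```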