import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # this is used for the plot the graph
import seaborn as sns # used for plot interactive graph.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
%matplotlib inline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
data= pd.read_csv("./google-play-store-apps/googleplaystore.csv")
data.head(n=3)
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 8.7M | 5,000,000+ | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up |
data['Category'].unique()
array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY', 'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION', 'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE', 'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME', 'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL', 'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL', 'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER', 'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION', '1.9'], dtype=object)
data.isnull().sum()
App 0 Category 0 Rating 1474 Reviews 0 Size 0 Installs 0 Type 1 Price 0 Content Rating 1 Genres 0 Last Updated 0 Current Ver 8 Android Ver 3 dtype: int64
def preprocessong(df):
    """Clean the apps frame in place and derive date features.

    Rows containing any missing value are dropped, 'Last Updated' is parsed
    to datetime, 'year_added' / 'month_added' are derived from it, and
    'Reviews' is converted from text to integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the 'Last Updated' and 'Reviews' columns.
        It is modified in place; nothing is returned.
    """
    # Drop incomplete rows first so the conversions below only see
    # populated cells.
    df.dropna(inplace=True)

    # Work on the frame that was passed in, not on a module-level global,
    # so the function can be used on any frame.
    df["Last Updated"] = pd.to_datetime(df["Last Updated"])
    df["year_added"] = df["Last Updated"].dt.year
    df["month_added"] = df["Last Updated"].dt.month

    # Vectorised cast instead of a per-row lambda; int64 keeps large
    # review counts intact.
    df["Reviews"] = df["Reviews"].astype("int64")
preprocessong(data)
data.columns
Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver', 'year_added', 'month_added'], dtype='object')
import plotly
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
# Share of each app 'Type' value, drawn as a pie chart.
col = "Type"
# Build a two-column frame (<col>, "count"). Naming the index and the value
# column explicitly gives the same layout on every pandas version; the
# earlier reset_index() + rename() relied on the pre-2.0 column names
# ("index", <col>) and produced two "count" columns on pandas >= 2.0.
grouped = data[col].value_counts().rename_axis(col).reset_index(name="count")

## plot
trace = go.Pie(labels=grouped[col], values=grouped['count'], pull=[0.08, 0])
# The title now describes the plotted column (it used to be an unrelated
# "Target(0 = No, 1 = Yes)" label).
layout = {'title': 'App type (Free vs Paid)'}
fig = go.Figure(data = [trace], layout = layout)
iplot(fig)
# Split the apps by pricing model.
d1 = data[data['Type'] == 'Free']
d2 = data[data['Type'] == 'Paid']

col = 'year_added'

# Apps per year for each group plus each year's share of the group total.
# rename_axis()/reset_index(name=...) yields the columns (<col>, 'count') on
# every pandas version, unlike reset_index() + rename(), which depends on the
# pre-2.0 column names. The percentage is computed in one vectorised step
# instead of re-summing the column for every row.
v1 = d1[col].value_counts().rename_axis(col).reset_index(name='count')
v1['percent'] = 100 * v1['count'] / v1['count'].sum()
v1 = v1.sort_values(col)

v2 = d2[col].value_counts().rename_axis(col).reset_index(name='count')
v2['percent'] = 100 * v2['count'] / v2['count'].sum()
v2 = v2.sort_values(col)

# One line per group, both drawn on the same axes.
trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout = {'title': "app udated or added over the years", 'xaxis': {'title': "years"}}
fig = go.Figure(data=y, layout=layout)
iplot(fig)
The plot above compares, year by year, the number of free and paid apps that were added or last updated. The dataset contains no paid apps from before 2011 (after that, Google apparently decided that people have a lot of money, so why not charge for some apps >> just for fun). In every year since then, far more free apps than paid apps have been added or updated, which suggests that people dislike paid services.
Comparing the apps added or updated in 2011 with those in 2018, the share of free apps rose from 80% to 96%, while the share of paid apps fell from 20% to 4%.
# Free apps per month of last update, as a bar chart.
col = 'month_added'
# rename_axis()/reset_index(name=...) gives the columns (<col>, 'count') on
# every pandas version; reset_index() + rename() only worked with the
# pre-2.0 column names.
v1 = d1[col].value_counts().rename_axis(col).reset_index(name='count')
# Vectorised share of the total (no per-row re-summing).
v1['percent'] = 100 * v1['count'] / v1['count'].sum()
v1 = v1.sort_values(col)
trace1 = go.Bar(x=v1[col], y=v1["count"], name="Free", marker=dict())
layout = {'title': "Free App added over the month", 'xaxis': {'title': "months"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
# Paid apps per month of last update, as a bar chart.
col = 'month_added'
# rename_axis()/reset_index(name=...) gives the columns (<col>, 'count') on
# every pandas version; reset_index() + rename() only worked with the
# pre-2.0 column names.
v2 = d2[col].value_counts().rename_axis(col).reset_index(name='count')
# Vectorised share of the total (no per-row re-summing).
v2['percent'] = 100 * v2['count'] / v2['count'].sum()
v2 = v2.sort_values(col)
# The trace label was misspelled "aid"; it names the paid-apps series.
trace1 = go.Bar(x=v2[col], y=v2["count"], name="Paid", marker=dict())
layout = {'title': "Paid App added over the month", 'xaxis': {'title': "months"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
# Free apps per content-rating label, as a bar chart.
col = 'Content Rating'
# rename_axis()/reset_index(name=...) gives the columns (<col>, 'count') on
# every pandas version; reset_index() + rename() only worked with the
# pre-2.0 column names.
v1 = d1[col].value_counts().rename_axis(col).reset_index(name='count')
# Vectorised share of the total (no per-row re-summing).
v1['percent'] = 100 * v1['count'] / v1['count'].sum()
v1 = v1.sort_values(col)
trace1 = go.Bar(x=v1[col], y=v1["count"], name="Free", marker=dict())
layout = {'title': "Free App Content Rating ", 'xaxis': {'title': "Contents"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
# Paid apps per content-rating label, as a bar chart.
col = 'Content Rating'
# rename_axis()/reset_index(name=...) gives the columns (<col>, 'count') on
# every pandas version; reset_index() + rename() only worked with the
# pre-2.0 column names.
v2 = d2[col].value_counts().rename_axis(col).reset_index(name='count')
# Vectorised share of the total (no per-row re-summing).
v2['percent'] = 100 * v2['count'] / v2['count'].sum()
v2 = v2.sort_values(col)
# The trace label was misspelled "aid"; it names the paid-apps series.
trace1 = go.Bar(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
layout = {'title': "Paid App Content Rating", 'xaxis': {'title': "contents"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
# Free apps per rating value, as a bar chart.
col = 'Rating'
# rename_axis()/reset_index(name=...) gives the columns (<col>, 'count') on
# every pandas version; reset_index() + rename() only worked with the
# pre-2.0 column names.
v1 = d1[col].value_counts().rename_axis(col).reset_index(name='count')
# Vectorised share of the total (no per-row re-summing).
v1['percent'] = 100 * v1['count'] / v1['count'].sum()
# Sorted by rating, matching how the paid-apps rating chart prepares its data.
v1 = v1.sort_values(col)
trace1 = go.Bar(x=v1[col], y=v1["count"], name="Free", marker=dict())
layout = {'title': "Free App Rating", 'xaxis': {'title': "Ratings"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
# Paid apps per rating value, as a bar chart.
col = 'Rating'
# rename_axis()/reset_index(name=...) gives the columns (<col>, 'count') on
# every pandas version; reset_index() + rename() only worked with the
# pre-2.0 column names.
v2 = d2[col].value_counts().rename_axis(col).reset_index(name='count')
# Vectorised share of the total (no per-row re-summing).
v2['percent'] = 100 * v2['count'] / v2['count'].sum()
v2 = v2.sort_values(col)
trace1 = go.Bar(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
layout = {'title': "Paid App Rating", 'xaxis': {'title': "Ratingss"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
# Free vs. paid app counts per category, as two lines on one chart.
col = 'Category'

# rename_axis()/reset_index(name=...) gives the columns (<col>, 'count') on
# every pandas version; reset_index() + rename() only worked with the
# pre-2.0 column names. Percentages are computed in one vectorised step.
v1 = d1[col].value_counts().rename_axis(col).reset_index(name='count')
v1['percent'] = 100 * v1['count'] / v1['count'].sum()
v1 = v1.sort_values(col)

v2 = d2[col].value_counts().rename_axis(col).reset_index(name='count')
v2['percent'] = 100 * v2['count'] / v2['count'].sum()
v2 = v2.sort_values(col)

trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout = {'title': "App Category"}
fig = go.Figure(data=y, layout=layout)
iplot(fig)
# Free vs. paid app counts per 'Android Ver' value, as two lines on one chart.
col = 'Android Ver'

# rename_axis()/reset_index(name=...) gives the columns (<col>, 'count') on
# every pandas version; reset_index() + rename() only worked with the
# pre-2.0 column names. Percentages are computed in one vectorised step.
v1 = d1[col].value_counts().rename_axis(col).reset_index(name='count')
v1['percent'] = 100 * v1['count'] / v1['count'].sum()
v1 = v1.sort_values(col)

v2 = d2[col].value_counts().rename_axis(col).reset_index(name='count')
v2['percent'] = 100 * v2['count'] / v2['count'].sum()
v2 = v2.sort_values(col)

trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout = {'title': "Android Versions"}
fig = go.Figure(data=y, layout=layout)
iplot(fig)
# Free vs. paid app counts per install bucket, as two lines on one chart.
col = 'Installs'


def _installs_as_number(labels):
    """Turn install labels such as '10,000+' into numbers for sorting."""
    # Anything that is not numeric after stripping ',' and '+' becomes NaN
    # and is therefore sorted last.
    return pd.to_numeric(labels.str.replace(r'[+,]', '', regex=True), errors='coerce')


# rename_axis()/reset_index(name=...) gives the columns (<col>, 'count') on
# every pandas version; reset_index() + rename() only worked with the
# pre-2.0 column names. Percentages are computed in one vectorised step.
v1 = d1[col].value_counts().rename_axis(col).reset_index(name='count')
v1['percent'] = 100 * v1['count'] / v1['count'].sum()
# The buckets are text; sort by their numeric value so the x axis runs from
# the smallest to the largest bucket instead of in lexicographic order.
v1 = v1.sort_values(col, key=_installs_as_number)

v2 = d2[col].value_counts().rename_axis(col).reset_index(name='count')
v2['percent'] = 100 * v2['count'] / v2['count'].sum()
v2 = v2.sort_values(col, key=_installs_as_number)

trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout = {'title': "Installed App ", 'xaxis': {'title': "Installs"}}
fig = go.Figure(data=y, layout=layout)
iplot(fig)
X= pd.read_csv("./google-play-store-apps/googleplaystore.csv")
X.isnull().sum().sum()
1487
# Summarise missing values per column: absolute count and fraction of rows.
null_mask = X.isnull()
total = null_mask.sum()
# count() on the boolean mask is the number of rows, so the ratio is the
# fraction of missing cells in each column, largest first.
row_counts = null_mask.count()
percent = (total / row_counts).sort_values(ascending=False)
summary_parts = [total, percent]
missing_data = pd.concat(summary_parts, axis=1, keys=['Total', 'Percent'])
missing_data.head(13)
Total | Percent | |
---|---|---|
App | 0 | 0.000000 |
Category | 0 | 0.000000 |
Rating | 1474 | 0.135965 |
Reviews | 0 | 0.000000 |
Size | 0 | 0.000000 |
Installs | 0 | 0.000000 |
Type | 1 | 0.000092 |
Price | 0 | 0.000000 |
Content Rating | 1 | 0.000092 |
Genres | 0 | 0.000000 |
Last Updated | 0 | 0.000000 |
Current Ver | 8 | 0.000738 |
Android Ver | 3 | 0.000277 |
X.dropna(inplace=True)
X.head(3)
App | Category | Rating | Reviews | Size | Installs | Type | Price | Content Rating | Genres | Last Updated | Current Ver | Android Ver | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Photo Editor & Candy Camera & Grid & ScrapBook | ART_AND_DESIGN | 4.1 | 159 | 19M | 10,000+ | Free | 0 | Everyone | Art & Design | January 7, 2018 | 1.0.0 | 4.0.3 and up |
1 | Coloring book moana | ART_AND_DESIGN | 3.9 | 967 | 14M | 500,000+ | Free | 0 | Everyone | Art & Design;Pretend Play | January 15, 2018 | 2.0.0 | 4.0.3 and up |
2 | U Launcher Lite – FREE Live Cool Themes, Hide ... | ART_AND_DESIGN | 4.7 | 87510 | 8.7M | 5,000,000+ | Free | 0 | Everyone | Art & Design | August 1, 2018 | 1.2.4 | 4.0.3 and up |
X.dtypes
App object Category object Rating float64 Reviews object Size object Installs object Type object Price object Content Rating object Genres object Last Updated object Current Ver object Android Ver object dtype: object
# Review counts go far beyond 255, so an 8-bit type wraps them around
# (967 became 199, 87510 became 214). Use a 64-bit integer instead.
X['Reviews'] = X['Reviews'].astype('int64')
# One-hot encode the three categorical predictors. Each gets its own column
# prefix, and the first level of each is dropped (drop_first=True) to avoid
# the dummy variable trap.
catgry, typ, cr = (
    pd.get_dummies(X[source_col], prefix=col_prefix, drop_first=True)
    for source_col, col_prefix in (
        ('Category', 'catg'),
        ('Type', 'typ'),
        ('Content Rating', 'cr'),
    )
)

# Append the indicator columns to the original frame, side by side.
frames = [X, catgry, typ, cr]
X = pd.concat(frames, axis=1)

# Remove the encoded source columns together with every other column that is
# not used as a model input, leaving 'Rating', 'Reviews' and the indicators.
X.drop(
    columns=[
        'Category', 'Installs', 'Type', 'Content Rating',
        'App', 'Size', 'Price', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver',
    ],
    inplace=True,
)
X.head(n=3)
Rating | Reviews | catg_AUTO_AND_VEHICLES | catg_BEAUTY | catg_BOOKS_AND_REFERENCE | catg_BUSINESS | catg_COMICS | catg_COMMUNICATION | catg_DATING | catg_EDUCATION | ... | catg_TOOLS | catg_TRAVEL_AND_LOCAL | catg_VIDEO_PLAYERS | catg_WEATHER | typ_Paid | cr_Everyone | cr_Everyone 10+ | cr_Mature 17+ | cr_Teen | cr_Unrated | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.1 | 159 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
1 | 3.9 | 199 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 4.7 | 214 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
3 rows × 40 columns
X.columns
Index(['Rating', 'Reviews', 'catg_AUTO_AND_VEHICLES', 'catg_BEAUTY', 'catg_BOOKS_AND_REFERENCE', 'catg_BUSINESS', 'catg_COMICS', 'catg_COMMUNICATION', 'catg_DATING', 'catg_EDUCATION', 'catg_ENTERTAINMENT', 'catg_EVENTS', 'catg_FAMILY', 'catg_FINANCE', 'catg_FOOD_AND_DRINK', 'catg_GAME', 'catg_HEALTH_AND_FITNESS', 'catg_HOUSE_AND_HOME', 'catg_LIBRARIES_AND_DEMO', 'catg_LIFESTYLE', 'catg_MAPS_AND_NAVIGATION', 'catg_MEDICAL', 'catg_NEWS_AND_MAGAZINES', 'catg_PARENTING', 'catg_PERSONALIZATION', 'catg_PHOTOGRAPHY', 'catg_PRODUCTIVITY', 'catg_SHOPPING', 'catg_SOCIAL', 'catg_SPORTS', 'catg_TOOLS', 'catg_TRAVEL_AND_LOCAL', 'catg_VIDEO_PLAYERS', 'catg_WEATHER', 'typ_Paid', 'cr_Everyone', 'cr_Everyone 10+', 'cr_Mature 17+', 'cr_Teen', 'cr_Unrated'], dtype='object')
# Features: every column of X except the target.
x = X.drop("Rating", axis=1)
# Target: read it from the same frame as the features so rows are aligned by
# construction, instead of from the separately loaded and cleaned `data`
# frame (which only matches if it was cleaned the same way).
y = X["Rating"].values
# Truncate the rating to its integer part (4.7 -> 4) to get class labels.
y = y.astype('int')
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=0)
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Transformer that drops incomplete rows and adds a constant to the rest.

    custom_param is the value added to every remaining cell in transform().
    """

    def __init__(self, custom_param):
        self.custom_param = custom_param

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X without NaN-containing rows, shifted by custom_param."""
        # NOTE(review): dropping rows here changes the number of samples while
        # y is left untouched; this is only harmless if X has no NaNs.
        complete_rows = X.dropna()
        shifted = complete_rows + self.custom_param
        return shifted
# SVC pipeline: custom offset -> standardisation -> grid-searched classifier.

# Hyper-parameter combinations tried by the grid search.
svc_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto'],
}

# 5-fold cross-validated search; training scores are kept so they can be
# compared with the validation scores afterwards.
svc_grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=svc_param_grid,
    cv=5,
    return_train_score=True,
)

# NOTE(review): the search is the last pipeline step, so the scaler is fitted
# on the whole training set before the folds are formed.
pipe1 = Pipeline(steps=[
    ("custom_transform", CustomTransformer(5)),
    ("scaler", StandardScaler()),
    ("clf", svc_grid_search),
])

# Fit the preprocessing steps and run the grid search on the training data.
pipe1.fit(X_train, y_train)
# Report the grid search: one line per parameter combination, then the winner.
grid_search = pipe1.named_steps['clf']
results = grid_search.cv_results_

score_rows = zip(
    results['mean_train_score'],
    results['mean_test_score'],
    results['params'],
)
for mean_train_score, mean_test_score, params in score_rows:
    print("Train score: {:.3f} | Test score: {:.3f} | Params: {}".format(
        mean_train_score, mean_test_score, params))

# Best cross-validated score and the parameters that produced it.
print("Best score: {}".format(grid_search.best_score_))
print("Best params: {}".format(grid_search.best_params_))
Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'} Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'} Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'} Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'} Train score: 0.756 | Test score: 0.755 | Params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'} Train score: 0.756 | Test score: 0.756 | Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'} Train score: 0.756 | Test score: 0.755 | Params: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'} Train score: 0.756 | Test score: 0.756 | Params: {'C': 1, 'gamma': 'auto', 'kernel': 'linear'} Train score: 0.758 | Test score: 0.754 | Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'} Train score: 0.756 | Test score: 0.756 | Params: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'} Train score: 0.758 | Test score: 0.754 | Params: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'} Train score: 0.756 | Test score: 0.756 | Params: {'C': 10, 'gamma': 'auto', 'kernel': 'linear'} Best score: 0.7556090373894205 Best params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
import pickle

# Final model with the best parameters reported by the grid search.
# The search evaluated SVC on standardised features, so the final model
# standardises its input too; fitting SVC on the raw features would train
# under different conditions from the ones the parameters were selected for.
# Bundling the scaler with the classifier also means the pickled object
# accepts raw feature rows directly.
svc_model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SVC(C=0.1, kernel='rbf', gamma='scale')),
])
svc_model.fit(X_train, y_train)

# Persist the fitted model for later reuse.
with open('svc_model.pkl', 'wb') as file:
    pickle.dump(svc_model, file)
import pickle

# Reload the persisted model from disk.
# NOTE: pickle.load executes code embedded in the file; only load files
# produced by this notebook or another trusted source.
with open('svc_model.pkl', 'rb') as file:
    model = pickle.load(file)

# Predict for a single sample: the first training row as a flat list,
# wrapped in an outer list because predict() expects a 2-D input.
first_row = X_train.iloc[0]
input1 = list(first_row)
prediction1 = model.predict([input1])
print(prediction1)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[4], line 5 3 with open('svc_model.pkl', 'rb') as file: 4 model = pickle.load(file) ----> 5 input1= list(X_train.iloc[ 0, : ]) 6 prediction1= model.predict([input1]) 7 print(prediction1) NameError: name 'X_train' is not defined
# Predicted class for every row of the held-out test set.
y_pred = svc_model.predict(X_test)

# Show the first training row as a flat list of feature values.
first_row = X_train.iloc[0]
input1 = list(first_row)
print(input1)
[33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0]
svc_model.predict([input1])
array([4])
from sklearn.metrics import accuracy_score, precision_score,recall_score, classification_report

# Compute the metrics on the held-out predictions.
accuracy = accuracy_score(y_test, y_pred)
# Error rate is the complement of accuracy.
error_rate = 1 - accuracy
# Macro averaging: the per-class scores are averaged with equal weight.
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')

# Print them in a fixed order, one "label value" pair per line.
for metric_label, metric_value in (
    ("Accuracy: ", accuracy),
    ("Error rate: ", error_rate),
    ("Precision: ", precision),
    ("Recall rate: ", recall),
):
    print(metric_label, metric_value)