Import libraries¶

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # this is used for the plot the graph 
import seaborn as sns # used for plot interactive graph.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
%matplotlib inline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
In [2]:
data= pd.read_csv("./google-play-store-apps/googleplaystore.csv")
In [3]:
data.head(n=3)
Out[3]:
App Category Rating Reviews Size Installs Type Price Content Rating Genres Last Updated Current Ver Android Ver
0 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1 159 19M 10,000+ Free 0 Everyone Art & Design January 7, 2018 1.0.0 4.0.3 and up
1 Coloring book moana ART_AND_DESIGN 3.9 967 14M 500,000+ Free 0 Everyone Art & Design;Pretend Play January 15, 2018 2.0.0 4.0.3 and up
2 U Launcher Lite – FREE Live Cool Themes, Hide ... ART_AND_DESIGN 4.7 87510 8.7M 5,000,000+ Free 0 Everyone Art & Design August 1, 2018 1.2.4 4.0.3 and up

Check for null values¶

In [4]:
data['Category'].unique()
Out[4]:
array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION',
       '1.9'], dtype=object)
In [5]:
data.isnull().sum()
Out[5]:
App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

Preprocessing¶

  • Remove the null values
  • Remove the rows from the dataset where rating is greater than 5
  • Adding 2 columns in the dataset by spliting them with last updates columsn, by doing this we find that in which year apps are added or updated on playstore.
In [6]:
def preprocessong(df):
    df.dropna(inplace=True)
    data["Last Updated"] = pd.to_datetime(data['Last Updated'])
    data['year_added']=data['Last Updated'].dt.year
    data['month_added']=data['Last Updated'].dt.month
    data['Reviews'] =data['Reviews'].apply(lambda x: int(x))
In [7]:
preprocessong(data)
In [8]:
data.columns
Out[8]:
Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver', 'year_added', 'month_added'],
      dtype='object')

Data Visualization¶

In [9]:
import plotly
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
col = "Type"
grouped = data[col].value_counts().reset_index()
grouped = grouped.rename(columns = {col : "count", "index" : col})

## plot
trace = go.Pie(labels=grouped[col], values=grouped['count'], pull=[0.08, 0])
layout = {'title': 'Target(0 = No, 1 = Yes)'}
fig = go.Figure(data = [trace], layout = layout)
iplot(fig)

Here we see that 92.6% apps are freee and 7.38% apps are paid on google playstore. so we say that Most of the people love free services¶

In [10]:
d1=data[data['Type']=='Free']
d2=data[data['Type']=='Paid']
In [11]:
col='year_added'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout={'title':"app udated or added over the years",'xaxis':{'title':"years"}}
fig = go.Figure(data=y, layout=layout)
iplot(fig)
  • In the above plot, we compare the number of apps that have been updated or added over the course of the year (free vs paid). We can conclude from this plot that there were no paid apps prior to 2011 (after that, Google believes that people have a lot of money, so why not charge for some apps >> just for fun). However, over the course of the year, a large number of free apps have been added in comparison to paid apps - indicating that people dislike paid services.

  • When comparing apps updated or added between 2011 and 2018, free apps increased from 80% to 96%, while paid apps decreased from 20% to 4%.

In [12]:
col='month_added'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
trace1 = go.Bar(x=v1[col], y=v1["count"], name="Free", marker=dict())
layout={'title':"Free App added over the month",'xaxis':{'title':"months"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
  • This Google Playstore data Almost half of the apps available on the Play Store are added or updated in July, 25% in August, and the remaining 25% in the remaining months.
In [13]:
col='month_added'
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Bar(x=v2[col], y=v2["count"], name="aid", marker=dict())
layout={'title':"Paid App added over the month",'xaxis':{'title':"months"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
In [14]:
col='Content Rating'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
trace1 = go.Bar(x=v1[col], y=v1["count"], name="Free", marker=dict())
layout={'title':"Free App Content Rating ",'xaxis':{'title':"Contents"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
In [15]:
col='Content Rating'
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Bar(x=v2[col], y=v2["count"], name="aid",  marker=dict(color="#6ad49b"))
layout={'title':"Paid App Content Rating",'xaxis':{'title':"contents"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
In [16]:
col='Rating'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
trace1 = go.Bar(x=v1[col], y=v1["count"], name="Free", marker=dict())
layout={'title':"Free App Rating",'xaxis':{'title':"Ratings"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
In [17]:
col='Rating'
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Bar(x=v2[col], y=v2["count"], name="Paid",  marker=dict(color="#6ad49b"))
layout={'title':"Paid App Rating",'xaxis':{'title':"Ratingss"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)
In [18]:
col='Category'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout={'title':"App Category"}
fig = go.Figure(data=y, layout=layout)
iplot(fig)
  • The majority of the apps fall into the Family Game and Tools categories.
  • The second most popular app category these days is games for kids!!!
In [19]:
col='Android Ver'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout={'title':"Android Versions"}
fig = go.Figure(data=y, layout=layout)
iplot(fig)
In [20]:
col='Installs'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout={'title':"Installed App ",'xaxis':{'title':"Installs"}}
fig = go.Figure(data=y, layout=layout)
iplot(fig)
  • According to the above plot, the most apps have 1M installs rather than 10M or 100k, and only a few have 500M or 1B installs. Some apps, such as Instagram, Youtube, Facebook WhatsApp, and others, have exceeded the 1 billion installs mark.

Data cleaning¶

In [21]:
X= pd.read_csv("./google-play-store-apps/googleplaystore.csv")
In [22]:
X.isnull().sum().sum()
Out[22]:
1487
In [23]:
total=X.isnull().sum()
percent = (X.isnull().sum()/X.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(13)
Out[23]:
Total Percent
App 0 0.000000
Category 0 0.000000
Rating 1474 0.135965
Reviews 0 0.000000
Size 0 0.000000
Installs 0 0.000000
Type 1 0.000092
Price 0 0.000000
Content Rating 1 0.000092
Genres 0 0.000000
Last Updated 0 0.000000
Current Ver 8 0.000738
Android Ver 3 0.000277
In [24]:
X.dropna(inplace=True)

Creating Dummy Variables¶

In [25]:
X.head(3)
Out[25]:
App Category Rating Reviews Size Installs Type Price Content Rating Genres Last Updated Current Ver Android Ver
0 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1 159 19M 10,000+ Free 0 Everyone Art & Design January 7, 2018 1.0.0 4.0.3 and up
1 Coloring book moana ART_AND_DESIGN 3.9 967 14M 500,000+ Free 0 Everyone Art & Design;Pretend Play January 15, 2018 2.0.0 4.0.3 and up
2 U Launcher Lite – FREE Live Cool Themes, Hide ... ART_AND_DESIGN 4.7 87510 8.7M 5,000,000+ Free 0 Everyone Art & Design August 1, 2018 1.2.4 4.0.3 and up
In [26]:
X.dtypes
Out[26]:
App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object
In [27]:
X['Reviews'] = X['Reviews'].astype('uint8')
In [28]:
# This line creates dummy variables for the 'Category' column of the dataframe 'X',
# with the prefix 'catg', and drops the first column to avoid the dummy variable trap
catgry=pd.get_dummies(X['Category'],prefix='catg',drop_first=True)

# This line creates dummy variables for the 'Type' column of the dataframe 'X',
# with the prefix 'typ', and drops the first column to avoid the dummy variable trap
typ=pd.get_dummies(X['Type'],prefix='typ',drop_first=True)

# This line creates dummy variables for the 'Content Rating' column of the dataframe 'X',
# with the prefix 'cr', and drops the first column to avoid the dummy variable trap
cr=pd.get_dummies(X['Content Rating'],prefix='cr',drop_first=True)

# This line concatenates the original dataframe 'X' with the newly created dummy variables
frames=[X,catgry,typ,cr]
X=pd.concat(frames,axis=1)

# This line drops the original 'Category', 'Installs', 'Type', and 'Content Rating' columns
# from the dataframe 'X', since they have now been replaced with the corresponding dummy variables
X.drop(['Category','Installs','Type','Content Rating'],axis=1,inplace=True)
In [29]:
X.drop(['App','Size','Price','Genres','Last Updated','Current Ver','Android Ver'],axis=1,inplace=True)
In [30]:
X.head(n=3)
Out[30]:
Rating Reviews catg_AUTO_AND_VEHICLES catg_BEAUTY catg_BOOKS_AND_REFERENCE catg_BUSINESS catg_COMICS catg_COMMUNICATION catg_DATING catg_EDUCATION ... catg_TOOLS catg_TRAVEL_AND_LOCAL catg_VIDEO_PLAYERS catg_WEATHER typ_Paid cr_Everyone cr_Everyone 10+ cr_Mature 17+ cr_Teen cr_Unrated
0 4.1 159 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
1 3.9 199 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
2 4.7 214 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0

3 rows × 40 columns

In [31]:
X.columns
Out[31]:
Index(['Rating', 'Reviews', 'catg_AUTO_AND_VEHICLES', 'catg_BEAUTY',
       'catg_BOOKS_AND_REFERENCE', 'catg_BUSINESS', 'catg_COMICS',
       'catg_COMMUNICATION', 'catg_DATING', 'catg_EDUCATION',
       'catg_ENTERTAINMENT', 'catg_EVENTS', 'catg_FAMILY', 'catg_FINANCE',
       'catg_FOOD_AND_DRINK', 'catg_GAME', 'catg_HEALTH_AND_FITNESS',
       'catg_HOUSE_AND_HOME', 'catg_LIBRARIES_AND_DEMO', 'catg_LIFESTYLE',
       'catg_MAPS_AND_NAVIGATION', 'catg_MEDICAL', 'catg_NEWS_AND_MAGAZINES',
       'catg_PARENTING', 'catg_PERSONALIZATION', 'catg_PHOTOGRAPHY',
       'catg_PRODUCTIVITY', 'catg_SHOPPING', 'catg_SOCIAL', 'catg_SPORTS',
       'catg_TOOLS', 'catg_TRAVEL_AND_LOCAL', 'catg_VIDEO_PLAYERS',
       'catg_WEATHER', 'typ_Paid', 'cr_Everyone', 'cr_Everyone 10+',
       'cr_Mature 17+', 'cr_Teen', 'cr_Unrated'],
      dtype='object')
In [32]:
x= X.drop("Rating", axis=1)
y= data["Rating"].values
y=y.astype('int')
In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=0)
In [42]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, custom_param):
        self.custom_param = custom_param
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X= X.dropna()


        return X+ self.custom_param

Modelling¶

In [35]:
# Define the pipeline
# Pipe using SVC()
pipe1 = Pipeline([
    ("custom_transform", CustomTransformer(5)),
    ("scaler", StandardScaler()),
    ("clf", GridSearchCV(
        estimator=SVC(),
        param_grid={
            'C': [0.1, 1, 10],
            'kernel': ['rbf', 'linear'],
            'gamma': ['scale', 'auto']
        },
        cv=5,
        return_train_score=True
    ))
])

# fit the pipeline
pipe1.fit(X_train, y_train)



# Print the results for each permutation
results = pipe1['clf'].cv_results_
for mean_train_score, mean_test_score, params in zip(results['mean_train_score'], results['mean_test_score'], results['params']):
    print(f"Train score: {mean_train_score:.3f} | Test score: {mean_test_score:.3f} | Params: {params}")

# print the best score and parameters
print(f"Best score: {pipe1['clf'].best_score_}")
print(f"Best params: {pipe1['clf'].best_params_}")
Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'}
Train score: 0.756 | Test score: 0.755 | Params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Train score: 0.756 | Test score: 0.755 | Params: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 1, 'gamma': 'auto', 'kernel': 'linear'}
Train score: 0.758 | Test score: 0.754 | Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Train score: 0.758 | Test score: 0.754 | Params: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 10, 'gamma': 'auto', 'kernel': 'linear'}
Best score: 0.7556090373894205
Best params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
In [46]:
import pickle
svc_model = SVC(C=0.1, kernel='rbf', gamma='scale')

svc_model.fit(X_train, y_train)

with open('svc_model.pkl', 'wb') as file:
    pickle.dump(svc_model, file)
In [4]:
import pickle

with open('svc_model.pkl', 'rb') as file:
    model = pickle.load(file)
input1= list(X_train.iloc[ 0, : ])
prediction1= model.predict([input1])
print(prediction1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 5
      3 with open('svc_model.pkl', 'rb') as file:
      4     model = pickle.load(file)
----> 5 input1= list(X_train.iloc[ 0, : ])
      6 prediction1= model.predict([input1])
      7 print(prediction1)

NameError: name 'X_train' is not defined
In [55]:
y_pred= svc_model.predict(X_test)
In [51]:
input1= list(X_train.iloc[ 0, : ])
print(input1)
[33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0]
In [52]:
svc_model.predict([input1])
Out[52]:
array([4])
In [ ]:
from sklearn.metrics import accuracy_score, precision_score,recall_score, classification_report
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
# Error rate
error_rate = 1 - accuracy
print("Error rate: ", error_rate)
# Precision
precision = precision_score(y_test, y_pred, average='macro')
print("Precision: ", precision)
# Recall rate
recall = recall_score(y_test, y_pred, average='macro')
print("Recall rate: ", recall)
  • Accuracy: 0.7644230769230769
  • Error rate: 0.23557692307692313
  • Precision: 0.15288461538461537
  • Recall rate: 0.2
In [ ]: