import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # this is used for the plot the graph 
import seaborn as sns # used for plot interactive graph.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
%matplotlib inline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


data= pd.read_csv("./google-play-store-apps/googleplaystore.csv")


data.head(n=3)


data['Category'].unique()

array(['ART_AND_DESIGN', 'AUTO_AND_VEHICLES', 'BEAUTY',
       'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
       'DATING', 'EDUCATION', 'ENTERTAINMENT', 'EVENTS', 'FINANCE',
       'FOOD_AND_DRINK', 'HEALTH_AND_FITNESS', 'HOUSE_AND_HOME',
       'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'GAME', 'FAMILY', 'MEDICAL',
       'SOCIAL', 'SHOPPING', 'PHOTOGRAPHY', 'SPORTS', 'TRAVEL_AND_LOCAL',
       'TOOLS', 'PERSONALIZATION', 'PRODUCTIVITY', 'PARENTING', 'WEATHER',
       'VIDEO_PLAYERS', 'NEWS_AND_MAGAZINES', 'MAPS_AND_NAVIGATION',
       '1.9'], dtype=object)


data.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64


def preprocessong(df):
    df.dropna(inplace=True)
    data["Last Updated"] = pd.to_datetime(data['Last Updated'])
    data['year_added']=data['Last Updated'].dt.year
    data['month_added']=data['Last Updated'].dt.month
    data['Reviews'] =data['Reviews'].apply(lambda x: int(x))


preprocessong(data)


data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver', 'year_added', 'month_added'],
      dtype='object')


import plotly
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
col = "Type"
grouped = data[col].value_counts().reset_index()
grouped = grouped.rename(columns = {col : "count", "index" : col})

## plot
trace = go.Pie(labels=grouped[col], values=grouped['count'], pull=[0.08, 0])
layout = {'title': 'Target(0 = No, 1 = Yes)'}
fig = go.Figure(data = [trace], layout = layout)
iplot(fig)


d1=data[data['Type']=='Free']
d2=data[data['Type']=='Paid']


col='year_added'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout={'title':"app udated or added over the years",'xaxis':{'title':"years"}}
fig = go.Figure(data=y, layout=layout)
iplot(fig)


col='month_added'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
trace1 = go.Bar(x=v1[col], y=v1["count"], name="Free", marker=dict())
layout={'title':"Free App added over the month",'xaxis':{'title':"months"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)


col='month_added'
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Bar(x=v2[col], y=v2["count"], name="aid", marker=dict())
layout={'title':"Paid App added over the month",'xaxis':{'title':"months"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)


col='Content Rating'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
trace1 = go.Bar(x=v1[col], y=v1["count"], name="Free", marker=dict())
layout={'title':"Free App Content Rating ",'xaxis':{'title':"Contents"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)


col='Content Rating'
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Bar(x=v2[col], y=v2["count"], name="aid",  marker=dict(color="#6ad49b"))
layout={'title':"Paid App Content Rating",'xaxis':{'title':"contents"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)


col='Rating'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
trace1 = go.Bar(x=v1[col], y=v1["count"], name="Free", marker=dict())
layout={'title':"Free App Rating",'xaxis':{'title':"Ratings"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)


col='Rating'
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Bar(x=v2[col], y=v2["count"], name="Paid",  marker=dict(color="#6ad49b"))
layout={'title':"Paid App Rating",'xaxis':{'title':"Ratingss"}}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)


col='Category'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout={'title':"App Category"}
fig = go.Figure(data=y, layout=layout)
iplot(fig)


col='Android Ver'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout={'title':"Android Versions"}
fig = go.Figure(data=y, layout=layout)
iplot(fig)


col='Installs'
v1=d1[col].value_counts().reset_index()
v1=v1.rename(columns={col:'count','index':col})
v1['percent']=v1['count'].apply(lambda x : 100*x/sum(v1['count']))
v1=v1.sort_values(col)
v2=d2[col].value_counts().reset_index()
v2=v2.rename(columns={col:'count','index':col})
v2['percent']=v2['count'].apply(lambda x : 100*x/sum(v2['count']))
v2=v2.sort_values(col)
trace1 = go.Scatter(x=v1[col], y=v1["count"], name="Free", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=v2[col], y=v2["count"], name="Paid", marker=dict(color="#6ad49b"))
y = [trace1, trace2]
layout={'title':"Installed App ",'xaxis':{'title':"Installs"}}
fig = go.Figure(data=y, layout=layout)
iplot(fig)


X= pd.read_csv("./google-play-store-apps/googleplaystore.csv")


X.isnull().sum().sum()

1487


total=X.isnull().sum()
percent = (X.isnull().sum()/X.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(13)


X.dropna(inplace=True)


X.head(3)


X.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object


X['Reviews'] = X['Reviews'].astype('uint8')


# This line creates dummy variables for the 'Category' column of the dataframe 'X',
# with the prefix 'catg', and drops the first column to avoid the dummy variable trap
catgry=pd.get_dummies(X['Category'],prefix='catg',drop_first=True)

# This line creates dummy variables for the 'Type' column of the dataframe 'X',
# with the prefix 'typ', and drops the first column to avoid the dummy variable trap
typ=pd.get_dummies(X['Type'],prefix='typ',drop_first=True)

# This line creates dummy variables for the 'Content Rating' column of the dataframe 'X',
# with the prefix 'cr', and drops the first column to avoid the dummy variable trap
cr=pd.get_dummies(X['Content Rating'],prefix='cr',drop_first=True)

# This line concatenates the original dataframe 'X' with the newly created dummy variables
frames=[X,catgry,typ,cr]
X=pd.concat(frames,axis=1)

# This line drops the original 'Category', 'Installs', 'Type', and 'Content Rating' columns
# from the dataframe 'X', since they have now been replaced with the corresponding dummy variables
X.drop(['Category','Installs','Type','Content Rating'],axis=1,inplace=True)


X.drop(['App','Size','Price','Genres','Last Updated','Current Ver','Android Ver'],axis=1,inplace=True)


X.head(n=3)


X.columns

Index(['Rating', 'Reviews', 'catg_AUTO_AND_VEHICLES', 'catg_BEAUTY',
       'catg_BOOKS_AND_REFERENCE', 'catg_BUSINESS', 'catg_COMICS',
       'catg_COMMUNICATION', 'catg_DATING', 'catg_EDUCATION',
       'catg_ENTERTAINMENT', 'catg_EVENTS', 'catg_FAMILY', 'catg_FINANCE',
       'catg_FOOD_AND_DRINK', 'catg_GAME', 'catg_HEALTH_AND_FITNESS',
       'catg_HOUSE_AND_HOME', 'catg_LIBRARIES_AND_DEMO', 'catg_LIFESTYLE',
       'catg_MAPS_AND_NAVIGATION', 'catg_MEDICAL', 'catg_NEWS_AND_MAGAZINES',
       'catg_PARENTING', 'catg_PERSONALIZATION', 'catg_PHOTOGRAPHY',
       'catg_PRODUCTIVITY', 'catg_SHOPPING', 'catg_SOCIAL', 'catg_SPORTS',
       'catg_TOOLS', 'catg_TRAVEL_AND_LOCAL', 'catg_VIDEO_PLAYERS',
       'catg_WEATHER', 'typ_Paid', 'cr_Everyone', 'cr_Everyone 10+',
       'cr_Mature 17+', 'cr_Teen', 'cr_Unrated'],
      dtype='object')


x= X.drop("Rating", axis=1)
y= data["Rating"].values
y=y.astype('int')


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=0)


from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, custom_param):
        self.custom_param = custom_param
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X= X.dropna()


        return X+ self.custom_param


# Define the pipeline
# Pipe using SVC()
pipe1 = Pipeline([
    ("custom_transform", CustomTransformer(5)),
    ("scaler", StandardScaler()),
    ("clf", GridSearchCV(
        estimator=SVC(),
        param_grid={
            'C': [0.1, 1, 10],
            'kernel': ['rbf', 'linear'],
            'gamma': ['scale', 'auto']
        },
        cv=5,
        return_train_score=True
    ))
])

# fit the pipeline
pipe1.fit(X_train, y_train)



# Print the results for each permutation
results = pipe1['clf'].cv_results_
for mean_train_score, mean_test_score, params in zip(results['mean_train_score'], results['mean_test_score'], results['params']):
    print(f"Train score: {mean_train_score:.3f} | Test score: {mean_test_score:.3f} | Params: {params}")

# print the best score and parameters
print(f"Best score: {pipe1['clf'].best_score_}")
print(f"Best params: {pipe1['clf'].best_params_}")

Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'}
Train score: 0.756 | Test score: 0.755 | Params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Train score: 0.756 | Test score: 0.755 | Params: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 1, 'gamma': 'auto', 'kernel': 'linear'}
Train score: 0.758 | Test score: 0.754 | Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Train score: 0.758 | Test score: 0.754 | Params: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Train score: 0.756 | Test score: 0.756 | Params: {'C': 10, 'gamma': 'auto', 'kernel': 'linear'}
Best score: 0.7556090373894205
Best params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}


import pickle
svc_model = SVC(C=0.1, kernel='rbf', gamma='scale')

svc_model.fit(X_train, y_train)

with open('svc_model.pkl', 'wb') as file:
    pickle.dump(svc_model, file)


import pickle

with open('svc_model.pkl', 'rb') as file:
    model = pickle.load(file)
input1= list(X_train.iloc[ 0, : ])
prediction1= model.predict([input1])
print(prediction1)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 5
      3 with open('svc_model.pkl', 'rb') as file:
      4     model = pickle.load(file)
----> 5 input1= list(X_train.iloc[ 0, : ])
      6 prediction1= model.predict([input1])
      7 print(prediction1)

NameError: name 'X_train' is not defined


y_pred= svc_model.predict(X_test)


input1= list(X_train.iloc[ 0, : ])
print(input1)

[33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0]


svc_model.predict([input1])

array([4])


from sklearn.metrics import accuracy_score, precision_score,recall_score, classification_report
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
# Error rate
error_rate = 1 - accuracy
print("Error rate: ", error_rate)
# Precision
precision = precision_score(y_test, y_pred, average='macro')
print("Precision: ", precision)
# Recall rate
recall = recall_score(y_test, y_pred, average='macro')
print("Recall rate: ", recall)

	App	Category	Rating	Reviews	Size	Installs	Type	Content Rating	Genres	Last Updated	Current Ver	Android Ver
0	Photo Editor & Candy Camera & Grid & ScrapBook	ART_AND_DESIGN	4.1	159	19M	10,000+	Free	Everyone	Art & Design	January 7, 2018	1.0.0	4.0.3 and up
1	Coloring book moana	ART_AND_DESIGN	3.9	967	14M	500,000+	Free	Everyone	Art & Design;Pretend Play	January 15, 2018	2.0.0	4.0.3 and up
2	U Launcher Lite – FREE Live Cool Themes, Hide ...	ART_AND_DESIGN	4.7	87510	8.7M	5,000,000+	Free	Everyone	Art & Design	August 1, 2018	1.2.4	4.0.3 and up

	App	Category	Rating	Reviews	Size	Installs	Type	Content Rating	Genres	Last Updated	Current Ver	Android Ver
0	Photo Editor & Candy Camera & Grid & ScrapBook	ART_AND_DESIGN	4.1	159	19M	10,000+	Free	Everyone	Art & Design	January 7, 2018	1.0.0	4.0.3 and up
1	Coloring book moana	ART_AND_DESIGN	3.9	967	14M	500,000+	Free	Everyone	Art & Design;Pretend Play	January 15, 2018	2.0.0	4.0.3 and up
2	U Launcher Lite – FREE Live Cool Themes, Hide ...	ART_AND_DESIGN	4.7	87510	8.7M	5,000,000+	Free	Everyone	Art & Design	August 1, 2018	1.2.4	4.0.3 and up

Import libraries¶

Check for null values¶

Preprocessing¶

Data Visualization¶

Here we see that 92.6% apps are freee and 7.38% apps are paid on google playstore. so we say that Most of the people love free services¶

Data cleaning¶

Creating Dummy Variables¶

Modelling¶

	Total	Percent
App	0	0.000000
Category	0	0.000000
Rating	1474	0.135965
Reviews	0	0.000000
Size	0	0.000000
Installs	0	0.000000
Type	1	0.000092
Price	0	0.000000
Content Rating	1	0.000092
Genres	0	0.000000
Last Updated	0	0.000000
Current Ver	8	0.000738
Android Ver	3	0.000277

	Rating	Reviews	...	cr_Everyone
0	4.1	159	...	1
1	3.9	199	...	1
2	4.7	214	...	1

	Rating	Reviews	...	cr_Everyone
0	4.1	159	...	1
1	3.9	199	...	1
2	4.7	214	...	1

	Rating	Reviews	...	cr_Everyone
0	4.1	159	...	1
1	3.9	199	...	1
2	4.7	214	...	1