#Import Library and Load File
import pandas as pd
import numpy as np

# Load the Mall Customers dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/mall-customers/Mall_Customers.csv')
df.info()  # checking data types and total null values
数据框摘要图
从输出结果中,我们可以看到数据框中有 5 列和 200 行,数据中没有空值。
让我们检查一下数据框中是否有任何重复的行。
# Report how many rows are flagged as duplicates by DataFrame.duplicated()
duplicated_rows = df.duplicated().sum()
print(f'Total Duplicated Rows : {duplicated_rows}')
接下来,我们查看每个数值列的百分位数摘要(从第 0 到第 100 百分位,每隔 10 个百分位取一次)。
# Let's see the percentiles of each numerical column in the dataset
def percentile(df, column):
    """Print the 0th through 100th percentiles of a column in steps of 10.

    Args:
        df: DataFrame (or other object) that can be indexed by ``column``.
        column: Name of the numeric column to summarise; it is also used in
            the printed header line.

    Returns:
        None. The summary is written to standard output, one line per
        percentile, each value rounded to two decimals.
    """
    print(f'{column} Percentile Summary :')
    ranks = list(range(0, 101, 10))
    # One vectorised call instead of a separate np.percentile call per rank.
    values = np.percentile(df[column], ranks)
    for rank, value in zip(ranks, values):
        print(f'- {rank}th Percentile : {round(value, 2)}')
# Print the percentile summary for Age, then Annual Income, then Spending Score
for column_name in ('Age', 'Annual Income (k$)', 'Spending Score (1-100)'):
    percentile(df, column_name)
# Count each gender total
# rename_axis(...).reset_index(name=...) pins the resulting column names to
# 'index' (the gender label) and 'Genre' (its count) on every pandas version.
# With a bare reset_index(), pandas >= 2.0 names them 'Genre' and 'count'
# instead, so the percentage below would try to sum the label strings.
gender_total = df['Genre'].value_counts().rename_axis('index').reset_index(name='Genre')
# Share of each gender as a whole-number percentage. Scaling to 100 before
# rounding avoids float artefacts such as 56.00000000000001.
gender_total['perc_genre'] = (gender_total['Genre'] / gender_total['Genre'].sum() * 100).round()
gender_total
顾客性别数量
上文中,我们检查了空值和重复值,显示了数值列的百分位数,并统计了分类列中每个唯一值的数量。
接下来,我们将开始探索上面的一些数据,以更好地了解我们的数据集。
2. 探索性数据分析
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Names of the numeric columns that are passed to plot_stats() below
num_cols = ['Age','Annual Income (k$)','Spending Score (1-100)']
def plot_stats(df, col_list):
    """Show a distribution plot and a box plot side by side for each column.

    For every column name in ``col_list`` a new figure with two axes is
    created. The left axis holds the distribution plot, with dashed vertical
    lines at the mean (green) and the median (red). The right axis holds the
    box plot. The column name is used as the figure title.

    Args:
        df: DataFrame containing the columns to plot.
        col_list: Iterable of column names in ``df`` to plot.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    # Iterate over the argument rather than the module-level ``num_cols`` so
    # the function plots exactly the columns the caller asked for.
    for col in col_list:
        series = df[col]
        _, (dist_ax, box_ax) = plt.subplots(1, 2, figsize=(9, 6))

        # NOTE(review): distplot is deprecated in newer seaborn releases;
        # consider histplot(..., kde=True) once the seaborn version is confirmed.
        sns.distplot(series, ax=dist_ax)
        # Pass the data as ``x`` explicitly: newer seaborn releases treat the
        # first positional argument as ``data``, which changes the orientation.
        sns.boxplot(x=series, ax=box_ax)

        # Mark the mean (green) and median (red) on the distribution plot.
        dist_ax.axvline(series.mean(), linestyle='--', linewidth=2, color='green')
        dist_ax.axvline(series.median(), linestyle='--', linewidth=2, color='red')

        dist_ax.set_ylabel('Frequency')
        dist_ax.set_title('Distribution Plot')
        box_ax.set_title('Box Plot')
        plt.suptitle(col)
        plt.show()
# Draw the distribution and box plots for the columns listed in num_cols
plot_stats(df, num_cols)
# Flooring and capping by replacing outliers with the 10th and 90th percentile.
# Series.clip() raises values below ``lower`` to ``lower`` and lowers values
# above ``upper`` to ``upper`` in a single step.

# Age 10th percentile and 90th percentile
tenth_percentile_age = np.percentile(df['Age'], 10)
ninetieth_percentile_age = np.percentile(df['Age'], 90)
df['Age'] = df['Age'].clip(lower=tenth_percentile_age, upper=ninetieth_percentile_age)

# Annual Income 10th percentile and 90th percentile
tenth_percentile_annualincome = np.percentile(df['Annual Income (k$)'], 10)
ninetieth_percentile_annualincome = np.percentile(df['Annual Income (k$)'], 90)
df['Annual Income (k$)'] = df['Annual Income (k$)'].clip(
    lower=tenth_percentile_annualincome,
    upper=ninetieth_percentile_annualincome,
)

# Checking distribution after replacing outliers with 10th and 90th percentile
plot_stats(df, num_cols)
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans#Normalize Numeric Features
# Rescale the columns at positions 3 and 4 of df with MinMaxScaler
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(df.iloc[:, 3:5])

# Get 2 principal components of the scaled features
pca = PCA(n_components=2)
pca.fit(scaled_features)
features_2d = pca.transform(scaled_features)
# 5 Centroids Model
# Columns at positions 3 and 4 of df are the clustering inputs.
# NOTE(review): the model is fitted on these raw values, not on the
# ``scaled_features`` computed earlier; confirm that this is intended.
feature = df.iloc[:, 3:5]

# K-means with 5 centroids; random_state is fixed so runs are repeatable
model = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=100,
    max_iter=1000,
    random_state=16,
)

# Fit to the data and predict the cluster assignment of each data point
km_clusters = model.fit_predict(feature.to_numpy())
km_clusters