import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans,AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import datetime
# Use a CJK-capable font so the Chinese labels used further down can be drawn,
# and fall back to the ASCII hyphen for minus signs so negative tick labels
# still render with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 'ANSI' is only registered as a codec alias (for 'mbcs') on Windows, so the
# original call fails with LookupError on every other platform. gb18030 is a
# superset of the GBK code page that "ANSI" resolves to on a Simplified-Chinese
# Windows install.
# NOTE(review): assumes air_data.csv is GBK-family encoded — confirm.
air_data = pd.read_csv('air_data.csv', encoding='gb18030')
print(air_data.head())
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() only adds a stray "None" line.
air_data.info()
# Step 1: keep only rows where both SUM_YR_1 and SUM_YR_2 are present.
exp1 = air_data['SUM_YR_1'].notnull()
exp2 = air_data['SUM_YR_2'].notnull()
air_data = air_data.loc[exp1 & exp2, :]
print(air_data.shape)

# Step 2: drop rows that are zero in both SUM_YR columns while avg_discount
# and SEG_KM_SUM are both positive.
index1 = air_data['SUM_YR_1'] == 0
index2 = air_data['SUM_YR_2'] == 0
index3 = air_data['avg_discount'] > 0
index4 = air_data['SEG_KM_SUM'] > 0
# `~` is the boolean negation for a mask; unary minus on a boolean Series is
# not the idiomatic spelling.
airline = air_data.loc[~(index1 & index2 & index3 & index4), :]
# The filtered frame used to be stored only in `airline` and never read, so
# everything downstream kept working on the unfiltered rows. Rebind air_data
# so the filter takes effect (and the shape printed below reflects it).
air_data = airline
print(air_data.shape)
# Columns feeding the five model features. The first two are dates and are
# only used to derive the day count below.
new_data = air_data[
    [
        'LOAD_TIME',
        'FFP_DATE',
        'LAST_TO_END',
        'FLIGHT_COUNT',
        'SEG_KM_SUM',
        'avg_discount',
    ]
]
new_data.info()  # info() prints its own report and returns None

# Whole days between FFP_DATE and LOAD_TIME.
# Taking .dt.days keeps new_data's row index, so the column-wise concat below
# pairs every duration with its own row. Rebuilding the column on a fresh
# 0..n-1 index (as before) made concat align by label against the filtered,
# non-contiguous index and silently mismatched or NaN-filled rows.
# It also yields NaN (removed by dropna below) instead of raising when a date
# is missing.
L = pd.to_datetime(new_data['LOAD_TIME']) - pd.to_datetime(new_data['FFP_DATE'])
L = L.dt.days.to_frame(name='Days')
air_features = pd.concat([L, new_data.iloc[:, 2:]], axis=1)
print(air_features.head())

# Column labels (kept in Chinese because later code and plots use them):
# L = membership length, R = time since last consumption, F = consumption
# frequency, M = total flight mileage, C = average discount rate.
air_features.columns = ['L(客户时长)', 'R(消费时间间隔)', 'F(消费频率)', 'M(总飞行里程)', 'C(平均折扣率)']
print(air_features.describe().T)

# Remove incomplete rows and renumber 0..n-1 so positional arrays produced
# later (scaled matrix, cluster labels) line up with this frame.
air_features = air_features.dropna()
air_features = air_features.reset_index(drop=True)
air_features.info()

# Standardise every feature (zero mean, unit variance) so no single column
# dominates the Euclidean distances used by the clustering.
data_scale = StandardScaler().fit_transform(air_features)
data_scale = pd.DataFrame(data_scale, columns=air_features.columns)
print(data_scale.head())
# Elbow method: fit K-Means for each candidate k on the standardised data and
# record inertia_ (within-cluster sum of squared distances).
# One range object drives both the fits and the x-axis so they cannot drift
# apart.
k_values = range(2, 10)
sse = [
    KMeans(n_clusters=k, random_state=100).fit(data_scale).inertia_
    for k in k_values
]

plt.figure()
plt.plot(k_values, sse, marker='o')
plt.xlabel('k')
plt.ylabel('sse')
plt.show()
# Silhouette analysis for each candidate k.
# The score must be computed in the same space the model was fitted in: the
# labels come from clustering data_scale, so scoring them against the raw
# air_features (as before) measured distances on unscaled columns and gave a
# number unrelated to the clustering being judged.
sil_k_values = range(2, 8)
sil = []
for k in sil_k_values:
    result = KMeans(n_clusters=k, random_state=100).fit(data_scale)
    sil.append(silhouette_score(data_scale, result.labels_))

plt.figure()
plt.plot(sil_k_values, sil, marker='o')
plt.xlabel('k')
plt.ylabel('sil')
plt.show()
# Final model: three clusters on the standardised features, fixed seed.
result = KMeans(n_clusters=3, random_state=100)
result.fit(data_scale)
cluster_sizes = pd.Series(result.labels_).value_counts()
print(cluster_sizes)

# Project the standardised data onto two principal components and draw the
# points coloured by their cluster label.
pca_2 = PCA(n_components=2)
projected = pca_2.fit_transform(data_scale)
data_pca_2 = pd.DataFrame(projected)
plt.figure()
plt.scatter(data_pca_2[0], data_pca_2[1], c=result.labels_)
plt.show()

# Attach the labels to the unscaled features and summarise each cluster by
# the mean of every remaining column.
air_features['cluster'] = result.labels_
print(air_features.head())
data_pivot = air_features.pivot_table(index=['cluster'], aggfunc='mean')
print(data_pivot)
def customer_type(cluser):
    """Return the customer-segment label for a cluster id.

    Args:
        cluser: Cluster id to translate (compared with ``==``).

    Returns:
        '重要发展客户' (important customer to develop) for 0,
        '最重要客户' (most important customer) for 1, and
        '一般客户' (ordinary customer) for anything else.
    """
    # NOTE(review): the id -> label mapping is hard-coded; presumably it was
    # chosen by inspecting the per-cluster means for the fitted model —
    # re-check it whenever the data or the clustering parameters change.
    if cluser == 0:
        return '重要发展客户'
    if cluser == 1:
        return '最重要客户'
    return '一般客户'
# Translate every cluster id into its segment name.
air_features['客户类型'] = air_features['cluster'].map(customer_type)
print(air_features.head())

# Number of customers per segment; printed and then reused for both charts.
customer_count = air_features['客户类型'].value_counts()
print(customer_count)

fig = plt.figure()

# Left panel: bar chart with the count written above each bar.
bar_ax = fig.add_subplot(1, 2, 1)
bar_ax.bar(customer_count.index, customer_count.values)
for segment, size in zip(customer_count.index, customer_count.values):
    bar_ax.text(segment, size, size, ha='center', va='bottom', color='b')

# Right panel: pie chart of the same counts with percentage labels.
pie_ax = fig.add_subplot(1, 2, 2)
pie_ax.pie(
    customer_count.values,
    labels=list(customer_count.index),
    autopct='%.1f%%',
    textprops={'color': 'r'},
    shadow=True,
)
pie_ax.legend(loc=1)
plt.show()