#初始化程序
import pandas as pd
import numpy as np
import time
import datetime
from pyecharts.charts import Pie, Page, Line
from pyecharts import options as opts
# Silence every warning raised from here on.
# NOTE(review): this is a blanket filter, so deprecation and pandas
# copy warnings are hidden as well.
import warnings
warnings.simplefilter("ignore")
# Make pandas account for wide (East Asian) characters when it aligns
# columns in printed output.
pd.options.display.unicode.ambiguous_as_wide = True
pd.options.display.unicode.east_asian_width = True
# Load the result of task 1.2.1; the first CSV column is used as the index
# and the first row as the header.
data1_2_1 = pd.read_csv('../../result/result1/task1_2_1.csv', index_col=0, header=0)
data1_2_1
# Task 3.1: keep only the rows whose Type is '消费' (consumption).
# The explicit .copy() makes data_consume an independent frame, so the
# column assignment below is not a chained assignment on a selection of
# data1_2_1.
data_consume = data1_2_1.loc[data1_2_1['Type'] == '消费'].copy()
data_consume['Date'] = pd.to_datetime(data_consume['Date'])
# Order the records chronologically and renumber the index from 0.
data_consume = data_consume.sort_values(by='Date', ascending=True, ignore_index=True)
data_consume
# Per card: number of consumption records (Num) and total amount (Money).
# Named aggregation with string reducers replaces the deprecated practice
# of passing np.size / np.sum as callables, and removes the rename step.
data_person = data_consume.groupby('CardNo', as_index=False).agg(
    Num=('PeoNo', 'size'),
    Money=('Money', 'sum'),
)
data_person
# Average swipe count and average spend per card.
# The number of cards is the number of rows in data_person; taking it from
# len() yields plain scalars instead of one-element Index objects.
person_count = len(data_person)
if person_count:
    avg_swipes = int(data_person['Num'].sum() / person_count)  # truncated to an integer, as before
    avg_money = float(data_person['Money'].sum() / person_count)
else:
    # No consumption records at all: report zeros rather than dividing by zero.
    avg_swipes = 0
    avg_money = 0.0
print('人均刷卡频次', avg_swipes)  # average swipe count per card
print('人均消费额', '%.2f' % avg_money)  # average amount spent per card
# Group by major, sex and Dept, counting the records (Num) and summing the
# amount (Money) of every combination.
data_profession = data_consume.groupby(['Major', 'Sex', 'Dept'], as_index=False).agg(
    Num=('PeoNo', 'size'),
    Money=('Money', 'sum'),
)
data_profession
# Keep the first three majors.
# groupby sorts its keys, so taking the first three distinct values in row
# order is deterministic; the previous list(set(...))[0:3] depended on string
# hash ordering and could select different majors on every run.
majors = data_profession['Major'].drop_duplicates().tolist()[:3]
print(majors)
# Pie charts: for each selected major, the share of records per Dept,
# drawn separately for male and female students.
def _build_rose_pie(subset, title):
    """Return a rose-type pie chart of Num per Dept for *subset*.

    subset: rows of data_profession for a single major and sex.
    title:  chart title.
    """
    pairs = [list(pair) for pair in zip(subset['Dept'].tolist(), subset['Num'].tolist())]
    return (
        Pie()
        .add('', pairs, radius=["30%", "75%"], rosetype="radius")
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title),
            # Scrollable vertical legend placed on the right-hand side.
            legend_opts=opts.LegendOpts(type_="scroll", pos_right="right", orient="vertical"),
        )
        # Label format: "<name>: <percentage>%".
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
    )


page = Page(layout=Page.DraggablePageLayout)  # container that combines several charts
for major in majors:
    data_temporary = data_profession[data_profession['Major'] == major]
    male = data_temporary[data_temporary['Sex'] == '男']  # male students
    female = data_temporary[data_temporary['Sex'] == '女']  # female students
    pie_male = _build_rose_pie(male, major + ' 男生 消费类型')
    pie_female = _build_rose_pie(female, major + ' 女生 消费类型')
    page.add(pie_male)
    page.add(pie_female)
page.render_notebook()
# Machine-learning experiment starts here.
# Only the three categorical columns are used.
data_tree = data_consume.loc[:, ['Sex', 'Major', 'Dept']]
data_tree
data_consume
# One-hot encode the categorical columns.
data_dummies = pd.get_dummies(data_tree)
data_dummies
print(data_dummies.columns.tolist())
# Feature columns: the label slice from 'Sex_女' through 'Major_18首饰设计',
# both ends inclusive.
# NOTE(review): the end labels are hard-coded and must exist in the encoded
# frame; confirm they still bracket every Sex_/Major_ column if the data changes.
feature_columns = slice('Sex_女', 'Major_18首饰设计')
features = data_dummies.loc[:, feature_columns]
# Feature matrix as a NumPy array.
X = features.to_numpy()
print(X.shape)
features
# Target variable: the one-hot columns from 'Dept_人文社科' through
# 'Dept_飞凤轩宿管办' (label slice, both ends inclusive).
y_df = data_dummies.loc[:, 'Dept_人文社科':'Dept_飞凤轩宿管办']
feature_target_name = y_df.columns.values.tolist()  # class index -> column name
y_np = y_df.values
[rows, cols] = y_np.shape
# Turn each one-hot row into the index of its active column.
# argmax over the boolean mask returns the first active column per row, which
# is what the former per-row np.argwhere loop produced, without a Python loop.
hot_mask = (y_np == 1)
if not hot_mask.any(axis=1).all():
    # The loop version failed here with an opaque IndexError; argmax would
    # instead silently label such rows as class 0, so fail explicitly.
    raise ValueError('at least one row has no active column in the selected target range')
features_taget = hot_mask.argmax(axis=1)
print(features_taget)
y = features_taget
print(y.shape)
y_df
feature_target_name
y
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing.
# random_state is fixed so the split, and therefore the scores reported
# afterwards, are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
print(X_train)
print(y_train)
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree on the training split and report its accuracy on both splits.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
# Predict the class of one training sample and look up its column name.
# The index is clamped so that a training set shorter than 249 rows does
# not raise an IndexError.
sample_index = min(248, len(X_train) - 1)
prediction = tree.predict([X_train[sample_index]])
prediction
# predict() returns a one-element array; take the element explicitly because
# calling int() on an array with ndim > 0 is deprecated in NumPy.
prediction = int(prediction[0])
feature_target_name[prediction]