博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Python EDA 数据分析例子(二分类问题,源代码)
阅读量:4106 次
发布时间:2019-05-25

本文共 7148 字,大约阅读时间需要 23 分钟。

# Exploratory data analysis and binary classification on the medical
# "no-show appointments" dataset: load the CSV, inspect and clean the
# features, plot distributions, derive date/time features, then train and
# evaluate several classifiers that predict the `Status` column.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import kernel_approximation
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")

data = pd.read_csv('https://raw.githubusercontent.com/wzy6642/Machine-Learning-Case-Studies/master/noshowappointments/data/No-show-Issue-Comma-300k.csv')
print(len(data))
# Printed explicitly: a bare `data.head()` only displays in a notebook.
print(data.head())

for column in list(data.columns):
    # "{0:25}" pads the column name to 25 characters; nunique() is the
    # number of distinct values found in that column.
    print("{0:25} {1}".format(column, data[column].nunique()))


def features_plots(discrete_vars):
    """Plot the distribution of every feature on one 7x2 subplot grid.

    The two continuous columns ('Age', 'AwaitingTime') are drawn as
    histograms in the first two cells; each column named in
    ``discrete_vars`` is drawn as a bar chart of its value counts in the
    following cells.

    Reads the module-level ``data`` frame. The grid has 14 cells, so at
    most 12 discrete variables fit.

    Args:
        discrete_vars: list of column names in ``data`` to draw as bar charts.
    """
    plt.figure(figsize=(15, 24.5))
    for i, cv in enumerate(['Age', 'AwaitingTime']):
        plt.subplot(7, 2, i + 1)
        # One bin per distinct value, i.e. one bar per observed value.
        plt.hist(data[cv], bins=len(data[cv].unique()))
        plt.title(cv)
        plt.ylabel('Frequency')
    for i, dv in enumerate(discrete_vars):
        # Offset by 3: cells 1 and 2 are taken by the histograms above.
        plt.subplot(7, 2, i + 3)
        data[dv].value_counts().plot(kind='bar', title=dv)
        plt.ylabel('Frequency')


discrete_vars = ['Gender', 'DayOfTheWeek', 'Status', 'Diabetes', 'Alcoolism', 'HiperTension',
                 'Handcap', 'Smokes', 'Scholarship', 'Tuberculosis', 'Sms_Reminder']
features_plots(discrete_vars)
# Flush the grid so later pyplot calls do not draw into its last subplot.
plt.show()

# Number of rows with a negative age (these rows are dropped below).
print(data[data['Age'] < 0]['Age'].value_counts().sum())
# .copy() so the column assignments below act on an independent frame
# rather than on a slice of the original.
data = data[data['Age'] >= 0].copy()
del data['Handcap']
data['AwaitingTime'] = data['AwaitingTime'].abs()

dow_mapping = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
               'Friday': 4, 'Saturday': 5, 'Sunday': 6}
data['DayOfTheWeek'] = data['DayOfTheWeek'].map(dow_mapping)

for field in ['Gender', 'Status']:
    # Categorical codes replace each label with the integer index of its
    # category, turning the categorical column into a numeric one.
    data[field] = pd.Categorical(data[field]).codes

discrete_vars = ['Gender', 'DayOfTheWeek', 'Status', 'Diabetes', 'Alcoolism', 'HiperTension',
                 'Smokes', 'Scholarship', 'Tuberculosis', 'Sms_Reminder']
features_plots(discrete_vars)
plt.show()

plt.scatter(data['Age'], data['AwaitingTime'], s=0.5)
plt.title('Scatter plot of Age and Awaiting Time')
plt.xlabel('Age')
plt.ylabel('Awaiting Time')
plt.xlim(0, 120)
plt.ylim(0, 120)

pd.set_option('display.width', 100)
# Fully-qualified key: the bare 'precision' abbreviation is not accepted
# by current pandas releases.
pd.set_option('display.precision', 3)
correlations = data[['Age', 'AwaitingTime']].corr(method='pearson')
print(correlations)

data_dow_status = data.groupby(['Sms_Reminder', 'Status'])['Sms_Reminder'].count().unstack('Status').fillna(0)
data_dow_status[[0, 1]].plot(kind='bar', stacked=True)
plt.title('Frequency of people showing up and not showing up by number of SMS reminders sent')
plt.xlabel('Number of SMS reminders')
plt.ylabel('Frequency')

data_dow_status = data.groupby(['DayOfTheWeek', 'Status'])['DayOfTheWeek'].count().unstack('Status').fillna(0)
data_dow_status[[0, 1]].plot(kind='bar', stacked=True)
plt.title('Frequency of people showing up and not showing up by Day of the week')
plt.xlabel('Day of the week')
plt.ylabel('Frequency')

data.boxplot(column=['Age'], return_type='axes', by='Status')
plt.show()

plt.figure(figsize=(15, 3.5))
# The enumerate index doubles as the encoded Status value (0 / 1).
for i, status in enumerate(['no show ups', 'show ups']):
    data_show = data[data['Status'] == i]
    plt.subplot(1, 2, i + 1)
    for gender in [0, 1]:
        data_gender = data_show[data_show['Gender'] == gender]
        freq_age = data_gender['Age'].value_counts().sort_index()
        freq_age.plot()
    plt.title('Age wise frequency of patient %s for both genders' % status)
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    # NOTE(review): assumes Gender code 0 is female and 1 is male - confirm
    # against the category order produced by pd.Categorical above.
    plt.legend(['Female', 'Male'], loc='upper left')

data.boxplot(column=['AwaitingTime'], return_type='axes', by='Status')
plt.show()

# Split the "<date>T<time>Z"-style timestamp strings into integer columns.
for col in ['AppointmentRegistration', 'ApointmentData']:
    date_parts = data[col].str.split('T').str[0].str.split('-', expand=True).astype(int)
    for index, component in enumerate(['year', 'month', 'day']):
        data['%s_%s' % (col, component)] = date_parts[index]

# Time-of-day is only taken from the registration timestamp; this is done
# once, outside the per-column loop above.
time_parts = (data['AppointmentRegistration'].str.split('T').str[1]
              .str[:-1].str.split(':', expand=True).astype(int))
for index, component in enumerate(['hour', 'min', 'sec']):
    data['%s_%s' % ('AppointmentRegistration', component)] = time_parts[index]

print(data.head())


# Classification
def model_performance(model, model_name, X_train, y_train, y_test, Y_pred):
    """Print evaluation metrics for a fitted classifier and plot its ROC curve.

    Args:
        model: fitted estimator exposing ``score(X, y)``.
        model_name: label printed in the report header.
        X_train: training features, used only for the train accuracy.
        y_train: training labels, used only for the train accuracy.
        y_test: true labels of the test split.
        Y_pred: predictions for the test split, as passed to the
            ``sklearn.metrics`` functions below.
    """
    print('Model name: %s' % model_name)
    # Fraction of test samples classified correctly.
    print('Test accuracy (Accuracy Score): %f' % metrics.accuracy_score(y_test, Y_pred))
    # ROC AUC computed directly from the true labels and the predictions.
    print('Test accuracy (ROC AUC Score): %f' % metrics.roc_auc_score(y_test, Y_pred))
    # Accuracy on the training split.
    print('Train accuracy: %f' % model.score(X_train, y_train))
    # precision_recall_curve returns (precision, recall, thresholds); the
    # area under the PR curve uses recall as x and precision as y.
    precision, recall, _ = metrics.precision_recall_curve(y_test, Y_pred)
    print('Area Under the Precision-Recall Curve: %f' % metrics.auc(recall, precision))
    # ROC curve: true positive rate (y) against false positive rate (x).
    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(y_test, Y_pred)
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()


features_of_choice = ['Age', 'Gender', 'DayOfTheWeek', 'Diabetes', 'Alcoolism', 'HiperTension', 'Smokes', 'Scholarship', 'Tuberculosis',
                      'Sms_Reminder', 'AwaitingTime', 'AppointmentRegistration_year', 'AppointmentRegistration_month',
                      'AppointmentRegistration_day', 'AppointmentRegistration_hour', 'AppointmentRegistration_min',
                      'AppointmentRegistration_sec', 'ApointmentData_year', 'ApointmentData_month', 'ApointmentData_day']
x = np.array(data[features_of_choice])
y = np.array(data['Status'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# Decision tree
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(clf, 'Decision tree classifier', x_train, y_train, y_test, y_pred)

# SGD classifier trained on an RBF kernel approximation of the features
rbf_feature = kernel_approximation.RBFSampler(gamma=1, random_state=1)
X_train = rbf_feature.fit_transform(x_train)
clf = SGDClassifier()
clf.fit(X_train, y_train)
# Only transform the test split: the feature map is fitted on training data.
X_test = rbf_feature.transform(x_test)
Y_pred = clf.predict(X_test)
model_performance(clf, 'Kernel approximation', X_train, y_train, y_test, Y_pred)

# Random forest
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(clf, 'Random Forest', x_train, y_train, y_test, y_pred)

# Gradient boosting
clf = GradientBoostingClassifier(random_state=10, learning_rate=0.1, n_estimators=200, max_depth=5, max_features=10)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(clf, 'Grandient Boosting', x_train, y_train, y_test, y_pred)

# Feature importances of the gradient boosting model, one feature per line.
for feature, score in zip(features_of_choice, list(clf.feature_importances_)):
    print('%s\t\t\t\t\t%f' % (feature, score))

转载地址:http://lpssi.baihongyu.com/

你可能感兴趣的文章
XHProf-php轻量级的性能分析工具
查看>>
PHP7新特性 What will be in PHP 7/PHPNG
查看>>
比较strtr, str_replace和preg_replace三个函数的效率
查看>>
ubuntu 下编译PHP5.5.7问题:configure: error: freetype.h not found.
查看>>
PHP编译configure时常见错误 debian centos
查看>>
configure: error: Please reinstall the BZip2 distribution
查看>>
OpenCV gpu模块样例注释:video_reader.cpp
查看>>
【增强学习在无人驾驶中的应用】
查看>>
OpenCV meanshift目标跟踪总结
查看>>
人工神经网络——神经元模型介绍
查看>>
今天,Python信息量很大!
查看>>
Flash 已死,Deno 当立?
查看>>
编程差的程序员,90%都是吃了数学的亏!骨灰级开发:方法不对,努力也白费...
查看>>
都无代码了,还要程序员吗?
查看>>
面试想拿 10K,HR 说我只配7k?
查看>>
那些人生“开挂”的程序员,都在干什么?
查看>>
影响科学圈的那些计算机代码
查看>>
乐视视频 App 图标改为“欠 122 亿”,网友:我在别家分红包,却在你家随份子!...
查看>>
为何程序员总喜欢写技术博客,看完恍然大悟...
查看>>
如何判断一家互联网公司要倒闭了?
查看>>