# This article is 7148 characters long; estimated reading time is about 23 minutes.
# Exploratory analysis and classification of medical-appointment no-shows.
#
# The script loads the appointment CSV, plots feature distributions, cleans
# and encodes the columns, derives date/time components, and then trains four
# classifiers (decision tree, kernel-approximation + SGD, random forest,
# gradient boosting), reporting accuracy / ROC / precision-recall metrics.
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import kernel_approximation
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")

data = pd.read_csv(
    'https://raw.githubusercontent.com/wzy6642/Machine-Learning-Case-Studies/master/noshowappointments/data/No-show-Issue-Comma-300k.csv'
)
print(len(data))
data.head()  # notebook-style display expression; no effect when run as a script

for column in data.columns:
    # {0:25} prints the column name padded to 25 characters; nunique() is the
    # number of distinct values in that column.
    print("{0:25} {1}".format(column, data[column].nunique()))


def features_plots(discrete_vars):
    """Plot the distribution of every feature of the module-level ``data``.

    'Age' and 'AwaitingTime' are drawn as histograms in the first two cells
    of a 7x2 subplot grid; each column named in ``discrete_vars`` is drawn as
    a bar chart of its value counts in the following cells.

    Args:
        discrete_vars: names of the discrete columns of ``data`` to plot.
            The grid has 14 cells, two of which are used by the histograms,
            so at most 12 names fit.
    """
    plt.figure(figsize=(15, 24.5))
    for i, cv in enumerate(['Age', 'AwaitingTime']):
        plt.subplot(7, 2, i + 1)
        # One bin per distinct value, i.e. one bar per observed value.
        plt.hist(data[cv], bins=len(data[cv].unique()))
        plt.title(cv)
        plt.ylabel('Frequency')
    for i, dv in enumerate(discrete_vars):
        # Cells 1 and 2 hold the histograms, so bar charts start at cell 3.
        plt.subplot(7, 2, i + 3)
        data[dv].value_counts().plot(kind='bar', title=dv)
        plt.ylabel('Frequency')


discrete_vars = ['Gender', 'DayOfTheWeek', 'Status', 'Diabetes', 'Alcoolism',
                 'HiperTension', 'Handcap', 'Smokes', 'Scholarship',
                 'Tuberculosis', 'Sms_Reminder']
features_plots(discrete_vars)

# --- Cleaning -------------------------------------------------------------
# Number of rows with a negative age (display expression only).
data[data['Age'] < 0]['Age'].value_counts().sum()
# Keep non-negative ages; copy so the column assignments below modify an
# independent frame instead of a slice of the original.
data = data[data['Age'] >= 0].copy()
del data['Handcap']
data['AwaitingTime'] = data['AwaitingTime'].abs()

dow_mapping = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
               'Friday': 4, 'Saturday': 5, 'Sunday': 6}
data['DayOfTheWeek'] = data['DayOfTheWeek'].map(dow_mapping)

for field in ['Gender', 'Status']:
    # pd.Categorical(...).codes replaces each category label by its integer
    # code, turning the categorical column into a numeric one.
    data[field] = pd.Categorical(list(data[field])).codes

discrete_vars = ['Gender', 'DayOfTheWeek', 'Status', 'Diabetes', 'Alcoolism',
                 'HiperTension', 'Smokes', 'Scholarship', 'Tuberculosis',
                 'Sms_Reminder']
features_plots(discrete_vars)

# --- Exploratory plots ----------------------------------------------------
# Start a fresh figure so the scatter is not drawn into the last subplot
# left current by features_plots().
plt.figure()
plt.scatter(data['Age'], data['AwaitingTime'], s=0.5)
plt.title('Scatter plot of Age and Awaiting Time')
plt.xlabel('Age')
plt.ylabel('Awaiting Time')
plt.xlim(0, 120)
plt.ylim(0, 120)

pd.set_option('display.width', 100)
# Fully qualified option name: the bare 'precision' pattern is ambiguous or
# rejected by newer pandas releases.
pd.set_option('display.precision', 3)
correlations = data[['Age', 'AwaitingTime']].corr(method='pearson')
print(correlations)

data_sms_status = (
    data.groupby(['Sms_Reminder', 'Status'])['Sms_Reminder']
    .count()
    .unstack('Status')
    .fillna(0)
)
data_sms_status[[0, 1]].plot(kind='bar', stacked=True)
plt.title('Frequency of people showing up and not showing up by number of SMS reminders sent')
plt.xlabel('Number of SMS reminders')
plt.ylabel('Frequency')

data_dow_status = (
    data.groupby(['DayOfTheWeek', 'Status'])['DayOfTheWeek']
    .count()
    .unstack('Status')
    .fillna(0)
)
data_dow_status[[0, 1]].plot(kind='bar', stacked=True)
plt.title('Frequency of people showing up and not showing up by Day of the week')
plt.xlabel('Day of the week')
plt.ylabel('Frequency')

data.boxplot(column=['Age'], return_type='axes', by='Status')
plt.show()

plt.figure(figsize=(15, 3.5))
for i, status in enumerate(['no show ups', 'show ups']):
    # The enumerate index doubles as the encoded Status value (0 / 1).
    data_show = data[data['Status'] == i]
    plt.subplot(1, 2, i + 1)
    for gender in [0, 1]:
        data_gender = data_show[data_show['Gender'] == gender]
        freq_age = data_gender['Age'].value_counts().sort_index()
        freq_age.plot()
    plt.title('Age wise frequency of patient %s for both genders' % status)
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    plt.legend(['Female', 'Male'], loc='upper left')

data.boxplot(column=['AwaitingTime'], return_type='axes', by='Status')
plt.show()

# --- Feature engineering --------------------------------------------------
# Split the 'T'-separated timestamp strings into integer components.
for col in ['AppointmentRegistration', 'ApointmentData']:
    for index, component in enumerate(['year', 'month', 'day']):
        data['%s_%s' % (col, component)] = data[col].apply(
            lambda x: int(x.split('T')[0].split('-')[index]))

# Only the registration timestamp contributes time-of-day features.
# NOTE(review): the original layout was lost; this loop is assumed to sit at
# top level since it hard-codes the column name (nesting it under the loop
# above would just recompute the same columns).
for index, component in enumerate(['hour', 'min', 'sec']):
    # [:-1] drops the last character of the time part (presumably a trailing
    # 'Z' -- confirm against the data) before splitting on ':'.
    data['%s_%s' % ('AppointmentRegistration', component)] = \
        data['AppointmentRegistration'].apply(
            lambda x: int(x.split('T')[1][:-1].split(':')[index]))
data.head()  # notebook-style display expression


# --- Classification -------------------------------------------------------
def model_performance(model, model_name, X_train, y_train, y_test, Y_pred):
    """Print evaluation metrics for a fitted classifier and plot its ROC curve.

    Args:
        model: fitted estimator exposing ``score(X, y)``.
        model_name: label printed at the top of the report.
        X_train: training features in the representation the model was
            fitted on (used only for the train-accuracy line).
        y_train: training labels.
        y_test: true labels of the test set.
        Y_pred: predictions for the test set, passed to the accuracy, ROC
            and precision-recall metrics.
    """
    print('Model name: %s' % model_name)
    # Accuracy: fraction of test samples classified correctly.
    print('Test accuracy (Accuracy Score): %f'
          % metrics.accuracy_score(y_test, Y_pred))
    # ROC AUC computed from the true labels and the supplied predictions.
    print('Test accuracy (ROC AUC Score): %f'
          % metrics.roc_auc_score(y_test, Y_pred))
    # Accuracy on the training data.
    print('Train accuracy: %f' % model.score(X_train, y_train))
    # precision_recall_curve returns (precision, recall, thresholds).
    precision, recall, _ = metrics.precision_recall_curve(y_test, Y_pred)
    # Area under the PR curve: recall on the x axis, precision on the y axis.
    print('Area Under the Precision-Recall Curve: %f'
          % metrics.auc(recall, precision))
    # ROC curve: true positive rate (y) against false positive rate (x).
    false_positive_rate, true_positive_rate, _ = metrics.roc_curve(
        y_test, Y_pred)
    roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b',
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()


features_of_choice = ['Age', 'Gender', 'DayOfTheWeek', 'Diabetes', 'Alcoolism',
                      'HiperTension', 'Smokes', 'Scholarship', 'Tuberculosis',
                      'Sms_Reminder', 'AwaitingTime',
                      'AppointmentRegistration_year',
                      'AppointmentRegistration_month',
                      'AppointmentRegistration_day',
                      'AppointmentRegistration_hour',
                      'AppointmentRegistration_min',
                      'AppointmentRegistration_sec',
                      'ApointmentData_year', 'ApointmentData_month',
                      'ApointmentData_day']
x = np.array(data[features_of_choice])
y = np.array(data['Status'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1)

# Decision tree
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(clf, 'Decision tree classifier', x_train, y_train, y_test,
                  y_pred)

# SGD classifier trained on an RBF kernel approximation of the features
rbf_feature = kernel_approximation.RBFSampler(gamma=1, random_state=1)
X_train = rbf_feature.fit_transform(x_train)
clf = SGDClassifier()
clf.fit(X_train, y_train)
# Reuse the feature map fitted on the training data; never refit on test data.
X_test = rbf_feature.transform(x_test)
Y_pred = clf.predict(X_test)
model_performance(clf, 'Kernel approximation', X_train, y_train, y_test,
                  Y_pred)

# Random forest
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(clf, 'Random Forest', x_train, y_train, y_test, y_pred)

# Gradient boosting
clf = GradientBoostingClassifier(random_state=10, learning_rate=0.1,
                                 n_estimators=200, max_depth=5,
                                 max_features=10)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(clf, 'Grandient Boosting', x_train, y_train, y_test, y_pred)

# Feature importances of the gradient-boosting model, one feature per line.
for feature, score in zip(features_of_choice, clf.feature_importances_):
    print('%s\t\t\t\t\t%f' % (feature, score))
# Reposted from: http://lpssi.baihongyu.com/