2022高教社杯全国大学生数学建模竞赛C题 问题二(1) Python代码


    • 问题 2
      • 2.1 依据附件数据分析高钾玻璃、铅钡玻璃的分类规律
        • 数据类别编码
        • 不平衡数据处理
        • 分类模型
          • 决策树分类
          • 随机森林分类
          • XGBoost分类
          • LightGBM分类
          • Catboost分类
          • 基于直方图的梯度提升Histogram-Based Gradient Boosting
          • 梯度提升树Gradient Boosting Tree
          • 逻辑回归Logistic
          • 朴素贝叶斯Naive Bayes
          • 支持向量机SVM
          • 神经网络Neural network

问题 2

2.1 依据附件数据分析高钾玻璃、铅钡玻璃的分类规律


# Remove the 'rowSum' column before building the model matrix.
d12 = d12.drop('rowSum', axis=1)
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder  # data encode

# Check for and handle categorical variables: every object-dtype column is
# integer-encoded (`apply` re-fits the encoder on each column in turn).
label_encoder = LabelEncoder()
x_categorical = d12.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
# `.values` discards the column labels, so the numeric part receives positional
# names once it is wrapped in a DataFrame; proper names are restored below.
x_numerical = d12.select_dtypes(exclude=['object']).values
# Numeric columns first, encoded categorical columns last.
# NOTE(review): concat aligns on the index; assumes d12 has a default 0..n-1 index - confirm.
df_encode = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1)
# rename columns
# Presumably d12 positions 0 and 6-19 are the 15 numeric columns; positions
# 15-20 of df_encode already carry the categorical column names.
colnames = [d12.columns[i] for i in [0] + list(range(6, 20))] + [df_encode.columns[i] for i in range(15, 21)]
df_encode.columns = colnames

5 rows × 21 columns

from sklearn.model_selection import train_test_split

# Features are every column except the encoded label column '类型'.
X = df_encode.drop('类型', axis=1)
y = df_encode['类型']
# Hold out 20% of the samples for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
铅钡    49
高钾    18
Name: count, dtype: int64
0    49
1    18
Name: count, dtype: int64
from imblearn.over_sampling import SMOTE

# Resample the training split only; X_test / y_test are not passed to SMOTE.
oversample = SMOTE()
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)
0    37
1    37
Name: count, dtype: int64


# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier  # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split  # Import train_test_split function
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

# Create Decision Tree classifier object
clf = DecisionTreeClassifier()
# Train Decision Tree classifier on the SMOTE-resampled training data
clf = clf.fit(X_train_smote, y_train_smote)
# Predict the response for test dataset
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the most recent predictions against the test labels.
cf_mat = confusion_matrix(y_test, y_pred)
print(cf_mat)
# Display object labelled with the classifier's classes; it is only built here,
# not drawn (no plot call follows in this cell).
disp = ConfusionMatrixDisplay(confusion_matrix=cf_mat, display_labels=clf.classes_)
[[12  0]
 [ 0  2]]



from sklearn import tree

# Export the fitted decision tree as indented text rules and show them;
# without the print the rules are computed but never displayed.
text_representation = tree.export_text(clf)
print(text_representation)
|--- feature_9 <= 5.46
|   |--- class: 1
|--- feature_9 >  5.46
|   |--- class: 0
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(25, 20))
# class_names are matched to the class labels in ascending order. The value
# counts printed after encoding show label 0 = '铅钡' (49) and label 1 = '高钾'
# (18), so '铅钡' must come first; the reversed order mislabels every node.
my_plot = tree.plot_tree(clf, feature_names=list(X.columns), class_names=['铅钡', '高钾'], filled=True)


import matplotlib.pyplot as plt

# define metrics
# Column 1 of predict_proba is the predicted probability of class label 1.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
fig = plt.figure(figsize=(10, 8))

# create ROC curve
# The curve itself was never drawn: fpr/tpr/auc were computed but unused,
# leaving an empty figure with only axis labels.
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.rc('font', size=20)
plt.rc('figure', titlesize=20)


import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn import tree
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Random forest of 500 trees with a fixed seed, trained on the SMOTE-resampled data.
rf = RandomForestClassifier(n_estimators=500, random_state=0)
rf.fit(X_train_smote, y_train_smote)
# Predict the response for test dataset with the forest itself; the previous
# code called clf.predict and therefore re-scored the decision tree.
y_pred = rf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
# Feature names for labelling the split nodes.
fn = list(X_train_smote.columns)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)
# Draw one member of the forest (index 200). class_names follow ascending label
# order: the value counts printed after encoding show 0 = '铅钡', 1 = '高钾'.
my_plot = tree.plot_tree(rf.estimators_[200], feature_names=fn, class_names=['铅钡', '高钾'], filled=True)


import xgboost

# Use "hist" for constructing the trees.
# Early stopping is not enabled: it needs a validation set passed to fit() via
# eval_set, and none is supplied here (using the test split for that purpose
# would leak test data into training).
# The module is imported under its full name so that binding the model to
# `xgb` does not overwrite a module alias of the same name.
xgb = xgboost.XGBClassifier(tree_method="hist")
# Fit the model on the SMOTE-resampled training data.
xgb.fit(X_train_smote, y_train_smote)
# Predict the response for test dataset
y_pred = xgb.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
# plot single tree


from lightgbm import LGBMClassifier

# LightGBM classifier with library-default settings.
gbm = LGBMClassifier()
gbm.fit(X_train_smote, y_train_smote)
# Predict the response for test dataset
y_pred = gbm.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from catboost import CatBoostClassifier

# CatBoost model limited to 100 estimators, constructed with verbose=0.
cat = CatBoostClassifier(n_estimators=100, verbose=0)
cat.fit(X_train_smote, y_train_smote)

# Score the fitted model on the held-out test split.
y_pred = cat.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
基于直方图的梯度提升Histogram-Based Gradient Boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting with default settings.
hbg = HistGradientBoostingClassifier()
hbg.fit(X_train_smote, y_train_smote)
# Predict the response for test dataset
y_pred = hbg.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
梯度提升树Gradient Boosting Tree
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting trees with default settings.
gb = GradientBoostingClassifier()
gb.fit(X_train_smote, y_train_smote)
# Predict the response for test dataset
y_pred = gb.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from sklearn import linear_model
import numpy

# Logistic regression with default settings.
logr = linear_model.LogisticRegression()
logr.fit(X_train_smote, y_train_smote)
# Predict with the logistic model itself; the previous code called gb.predict
# and therefore reported the gradient-boosting model's scores again.
y_pred = logr.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
# Coefficients of the fitted logistic regression (one per feature), on the log-odds scale.
log_odds = logr.coef_
# Exponentiate the coefficients to express them as odds ratios.
odds = numpy.exp(log_odds)
模型的回归系数(已取指数,即优势比 odds = exp(coef_)):
array([[0.82909358, 1.21679202, 1.00751889, 1.15839779, 1.06619743,
        0.99198439, 0.96850246, 0.99970948, 1.01100367, 0.75457187,
        0.91357586, 0.99337601, 0.99650844, 1.00080449, 1.00000987,
        1.020977  , 1.02869626, 0.99088894, 0.8292335 , 0.98475949]])
朴素贝叶斯Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
gnb = GaussianNB()
# fit() is called for its effect on gnb; its result is no longer stored in
# y_pred, which previously held the estimator until overwritten.
gnb.fit(X_train_smote, y_train_smote)
y_pred = gnb.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from sklearn.svm import SVC

# Support vector classifier with default settings. The class is imported
# directly so that binding the model to `svm` does not overwrite an imported
# module of the same name (which made the construction line fail when re-run).
svm = SVC()
svm.fit(X_train_smote, y_train_smote)
y_pred = svm.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
神经网络Neural network
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with two hidden layers (5 and 2 units), trained with
# the L-BFGS solver; random_state fixes the weight initialisation.
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
nn.fit(X_train_smote, y_train_smote)
y_pred = nn.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14


