2022 CUMCM (Higher Education Press Cup) Problem C, Question 2(1): Python Code
Contents
- Problem 2
- 2.1 Classification rules for high-potassium vs. lead-barium glass from the attached data
- Categorical encoding
- Handling imbalanced data
- Classification models
- Decision tree
- Random forest
- XGBoost
- LightGBM
- CatBoost
- Histogram-Based Gradient Boosting
- Gradient Boosting Tree
- Logistic regression
- Naive Bayes
- Support vector machine (SVM)
- Neural network
Problem 2
2.1 Classification rules for high-potassium vs. lead-barium glass from the attached data
Categorical encoding
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Drop the row-sum helper column left over from preprocessing
d12 = d12.drop('rowSum', axis=1)

# Encode the categorical (object-dtype) columns as integer labels
label_encoder = LabelEncoder()
x_categorical = d12.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = d12.select_dtypes(exclude=['object']).values
df_encode = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1)

# Restore readable column names
colnames = list(d12.columns[i] for i in ([0] + list(range(6, 20)))) + \
           list(df_encode.columns[i] for i in range(15, 21))
df_encode.columns = colnames
df_encode.head()
  | 文物编号 | 二氧化硅(SiO2) | 氧化钠(Na2O) | 氧化钾(K2O) | 氧化钙(CaO) | 氧化镁(MgO) | 氧化铝(Al2O3) | 氧化铁(Fe2O3) | 氧化铜(CuO) | 氧化铅(PbO) | ... | 五氧化二磷(P2O5) | 氧化锶(SrO) | 氧化锡(SnO2) | 二氧化硫(SO2) | 纹饰 | 类型 | 颜色 | 表面风化 | 文物采样点 | 风化标记
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1.0 | 69.33 | 0.0 | 9.99 | 6.32 | 0.87 | 3.93 | 1.74 | 3.87 | 0.00 | ... | 1.17 | 0.00 | 0.0 | 0.39 | 2 | 1 | 6 | 0 | 0 | 1 |
1 | 2.0 | 36.28 | 0.0 | 1.05 | 2.34 | 1.18 | 5.73 | 1.86 | 0.26 | 47.43 | ... | 3.57 | 0.19 | 0.0 | 0.00 | 0 | 0 | 1 | 1 | 1 | 1 |
2 | 3.0 | 87.05 | 0.0 | 5.19 | 2.01 | 0.00 | 4.06 | 0.00 | 0.78 | 0.25 | ... | 0.66 | 0.00 | 0.0 | 0.00 | 0 | 1 | 6 | 0 | 2 | 1 |
3 | 3.0 | 61.71 | 0.0 | 12.37 | 5.87 | 1.11 | 5.50 | 2.16 | 5.09 | 1.41 | ... | 0.70 | 0.10 | 0.0 | 0.00 | 0 | 1 | 6 | 0 | 3 | 1 |
4 | 4.0 | 65.88 | 0.0 | 9.67 | 7.12 | 1.56 | 6.44 | 2.06 | 2.18 | 0.00 | ... | 0.79 | 0.00 | 0.0 | 0.36 | 0 | 1 | 6 | 0 | 4 | 1 |
5 rows × 21 columns
from sklearn.model_selection import train_test_split

X = df_encode.drop('类型', axis=1)
y = df_encode['类型']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
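With only 67 samples, this unstratified split happens to leave just 2 minority-class items in the test set (see the reports below). Passing stratify=y would preserve the class ratio; a minimal sketch of that variant (not what the original run used):

# Stratified variant (hypothetical alternative; all results below use the
# unstratified split above)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y)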
Handling imbalanced data
d12['类型'].value_counts()
类型
铅钡 49
高钾 18
Name: count, dtype: int64
df_encode['类型'].value_counts()
类型
0 49
1 18
Name: count, dtype: int64
from imblearn.over_sampling import SMOTE

# Oversample the minority class on the training split only, so the test set stays untouched
oversample = SMOTE()
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)
y_train_smote.value_counts()
类型
0 37
1 37
Name: count, dtype: int64
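SMOTE is one way to handle the imbalance; many of the classifiers below also accept a class_weight parameter that reweights the loss instead of synthesizing samples. A minimal sketch of that alternative (not part of the original pipeline):

from sklearn.tree import DecisionTreeClassifier

# Reweight classes inversely to their frequency instead of oversampling;
# note this fits on the raw (un-resampled) training split.
clf_weighted = DecisionTreeClassifier(class_weight='balanced', random_state=1)
clf_weighted.fit(X_train, y_train)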
Classification models
Decision tree
Model evaluation reference: https://www.statology.org/sklearn-classification-report/
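Every model below ends with the same accuracy / classification-report boilerplate. A small helper like the sketch below (evaluate is not part of the original code) would keep that pattern in one place; the per-model code is kept verbatim here so each section stands alone.

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def evaluate(model, X_test, y_test):
    """Print accuracy, per-class report, and confusion matrix for a fitted model."""
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    return y_pred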
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

# Create the decision tree classifier
clf = DecisionTreeClassifier()

# Train it on the SMOTE-balanced training data
clf = clf.fit(X_train_smote, y_train_smote)

# Predict on the held-out test set
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix')
print(cf_mat)

disp = ConfusionMatrixDisplay(confusion_matrix=cf_mat, display_labels=clf.classes_)
disp.plot()
plt.show()
Confusion matrix
[[12  0]
 [ 0  2]]
Decision tree visualization
Reference: https://mljar.com/blog/visualize-decision-tree/
from sklearn import tree
text_representation = tree.export_text(clf)
print(text_representation)
|--- feature_9 <= 5.46
| |--- class: 1
|--- feature_9 > 5.46
| |--- class: 0
In this column order, feature_9 is 氧化铅(PbO): the fitted tree separates the two glass types with a single lead-oxide threshold (PbO ≤ 5.46 → 高钾, PbO > 5.46 → 铅钡).
fig = plt.figure(figsize=(25, 20))
# class_names must be in ascending label order: 0 = 铅钡, 1 = 高钾
my_plot = tree.plot_tree(clf, feature_names=list(X.columns), class_names=['铅钡', '高钾'], filled=True)
# Compute ROC metrics from the predicted class-1 probabilities
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rc('font', size=20)
plt.rc('figure', titlesize=20)

# Plot the ROC curve
fig = plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.show()
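A perfect score on 14 test samples (only 2 of them from class 1) is weak evidence by itself. Stratified cross-validation over all 67 samples, with SMOTE applied inside each fold so it never sees validation data, gives a steadier estimate. A sketch using imblearn's pipeline (an addition, not part of the original run):

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# SMOTE lives inside the pipeline, so resampling happens per-fold on training data only
pipe = Pipeline([('smote', SMOTE(random_state=1)),
                 ('tree', DecisionTreeClassifier(random_state=1))])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scores = cross_val_score(pipe, X, y, cv=cv, scoring='accuracy')
print(scores.mean(), scores.std())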
随机森林分类
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn import tree
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_splitrf = RandomForestClassifier(n_estimators=500,random_state=0)
rf.fit(X_train_smote, y_train_smote)#Predict the response for test dataset
y_pred = clf.predict(X_test)print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
fn = list(X_train_smote.columns)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)
# Visualize one tree from the forest; class_names in ascending label order (0=铅钡, 1=高钾)
my_plot = tree.plot_tree(rf.estimators_[200], feature_names=fn, class_names=['铅钡', '高钾'], filled=True)
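Beyond a single tree, the forest's impurity-based feature importances speak directly to the classification rule the question asks for. A short sketch using the fitted rf above:

import pandas as pd

importances = pd.Series(rf.feature_importances_, index=X_train_smote.columns)
print(importances.sort_values(ascending=False).head(10))  # top discriminating oxides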
XGBoost
import xgboost as xgb

# Use "hist" for tree construction, with early stopping; a distinct variable
# name avoids shadowing the xgb module.
xgb_model = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)
# Fit; the test set doubles as the early-stopping validation set
# (leaky, but kept for simplicity on this small dataset)
xgb_model.fit(X_train_smote, y_train_smote, eval_set=[(X_test, y_test)], verbose=False)

# Predict on the test set
y_pred = xgb_model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
# plot single tree
# plot_tree(xgb_model)
# plt.show()
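The commented-out call above needs xgboost's own plot_tree helper (and a graphviz installation); a minimal working sketch:

from xgboost import plot_tree

plot_tree(xgb_model, num_trees=0)  # draw the first boosted tree; requires graphviz
plt.show()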
LightGBM
Reference: https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/
from lightgbm import LGBMClassifier

gbm = LGBMClassifier()
gbm.fit(X_train_smote, y_train_smote)
[LightGBM] [Info] Number of positive: 37, number of negative: 37
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 292
[LightGBM] [Info] Number of data points in the train set: 74, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
(the warning above repeats once per boosting round; repetitions omitted)
LGBMClassifier()
# Predict on the test set
y_pred = gbm.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
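The wall of "No further splits with positive gain" warnings above is expected: with 74 training rows, LightGBM's default min_child_samples=20 leaves little room to split, so many boosting rounds produce no usable split. Loosening that threshold and muting the log is a reasonable tweak (a sketch, not the configuration used above):

gbm = LGBMClassifier(min_child_samples=5, verbose=-1)  # allow smaller leaves, silence the log
gbm.fit(X_train_smote, y_train_smote)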
CatBoost
from catboost import CatBoostClassifier
cat = CatBoostClassifier(verbose=0, n_estimators=100)
cat.fit(X_train_smote, y_train_smote)

# Predict on the test set
y_pred = cat.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Histogram-Based Gradient Boosting
from sklearn.ensemble import HistGradientBoostingClassifier

hbg = HistGradientBoostingClassifier()
hbg.fit(X_train_smote, y_train_smote)

# Predict on the test set
y_pred = hbg.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Gradient Boosting Tree
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
gb.fit(X_train_smote, y_train_smote)
y_pred = gb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Logistic regression
import numpy
from sklearn import linear_model

# max_iter raised as a convergence safeguard on unscaled features (an added tweak)
logr = linear_model.LogisticRegression(max_iter=1000)
logr.fit(X_train_smote, y_train_smote)

# Predict with the logistic model (the original mistakenly reused gb here)
y_pred = logr.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
print('Model odds ratios (exp of the regression coefficients):')
log_odds = logr.coef_
odds = numpy.exp(log_odds)
odds
Model odds ratios (exp of the regression coefficients):
array([[0.82909358, 1.21679202, 1.00751889, 1.15839779, 1.06619743,
        0.99198439, 0.96850246, 0.99970948, 1.01100367, 0.75457187,
        0.91357586, 0.99337601, 0.99650844, 1.00080449, 1.00000987,
        1.020977  , 1.02869626, 0.99088894, 0.8292335 , 0.98475949]])
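Pairing each odds ratio with its column name makes the array interpretable: values below 1 push the odds toward class 0 (铅钡) and above 1 toward class 1 (高钾). A sketch:

import pandas as pd

odds_by_feature = pd.Series(odds[0], index=X.columns)
print(odds_by_feature.sort_values())  # e.g. 氧化铅(PbO), at 0.7546, sits well below 1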
Naive Bayes
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_smote, y_train_smote)
y_pred = gnb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Support vector machine (SVM)
from sklearn.svm import SVC

# A distinct variable name avoids shadowing the svm module
svc = SVC()
svc.fit(X_train_smote, y_train_smote)
y_pred = svc.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
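SVMs are sensitive to feature scale, and the oxide percentages here range from near 0 to almost 90; standardizing inside a pipeline is the usual precaution. A sketch (an addition, not the run above):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

svc_scaled = make_pipeline(StandardScaler(), SVC())
svc_scaled.fit(X_train_smote, y_train_smote)
print("Accuracy:", metrics.accuracy_score(y_test, svc_scaled.predict(X_test)))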
Neural network
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
nn.fit(X_train_smote, y_train_smote)
y_pred = nn.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
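Since every model reaches the same perfect report on this small test set, a closing loop that scores them side by side summarizes the section (a sketch; it assumes the fitted objects from the sections above are still in scope):

models = {'DecisionTree': clf, 'RandomForest': rf, 'XGBoost': xgb_model,
          'LightGBM': gbm, 'CatBoost': cat, 'HistGB': hbg,
          'GradientBoosting': gb, 'Logistic': logr, 'NaiveBayes': gnb,
          'SVM': svc, 'MLP': nn}
for name, model in models.items():
    print(f"{name}: {metrics.accuracy_score(y_test, model.predict(X_test)):.3f}")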
Related reading:
- 2022 CUMCM (Higher Education Press Cup) Problem C, Question 2(2): Python code
- 2023 CUMCM (Higher Education Press Cup) Problem C: Python code
- 2021 CUMCM (Higher Education Press Cup) Problem C, Questions 1 & 2: Python code