JohnLyu的blog

橙汁事务所艾欧泽亚分部

0%

svm简易分类器

svm_covtype.py:

建议拿到jupyter里跑

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""
dataset from https://archive.ics.uci.edu/ml/datasets/Covertype
target: create a svm two-class classifier

@author John Lyu
@date 10/6/2020
"""
import pandas as pd
import numpy as np
import re
import pickle
import os

from IPython.display import display

from sklearn.model_selection import train_test_split

# create colnames
column_descriptions = """Elevation
Aspect
Slope
Horizontal_Distance_To_Hydrology
Vertical_Distance_To_Hydrology
Horizontal_Distance_To_Roadways
Hillshade_9am
Hillshade_Noon
Hillshade_3pm
Horizontal_Distance_To_Fire_Points
Wilderness_Area (4 binary columns)
Soil_Type (40 binary columns)
Cover_Type (7 types) """.splitlines()

col_names = []
one_hot_pattern = re.compile(r"(.+?)\((\d+).+columns\)")
for cd in column_descriptions:
m = one_hot_pattern.search(cd)
if m:
for i in range(int(m.group(2))):
col_names.append(f"{m.group(1).strip()}_{i}")
else:
col_names.append(cd.strip())
display(col_names)

# Load the raw Covertype data (the file has no header row, hence `names=`).
# NOTE(review): the UCI covtype.data is presumably complete, so fillna(0)
# looks defensive rather than required — confirm against the data file.
raw_df = pd.read_csv("./covtype.data", names=col_names).fillna(0)

display(raw_df.head())

def get_float_columns(df, limit=30):
    """Return names of columns that look continuous.

    A column is treated as continuous when it has more than *limit*
    distinct values; low-cardinality columns (e.g. one-hot indicators)
    are excluded.
    """
    return [column for column in df.columns if df[column].nunique() > limit]

# Standardize the continuous columns (z-score); one-hot indicator columns
# are left untouched.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

norm_df = raw_df.copy()
fcols = get_float_columns(norm_df)
norm_df[fcols] = ss.fit_transform(norm_df[fcols])

display(norm_df.head())

# Show the class distribution of the target (last column).
# FIX: the original left this as a bare expression, whose value is silently
# discarded when the file runs as a plain script (it only shows output as the
# last expression of a Jupyter cell) — wrap it in display() instead.
display(norm_df[norm_df.columns[-1]].value_counts())

from sklearn import preprocessing  # NOTE(review): unused here; kept since removing file imports may break later cells
from sklearn.svm import SVC

# Split off the last column as the target.
# BUG FIX: the original sliced raw_df here, which silently discarded the
# standardization computed into norm_df above (making that whole step dead
# code); a linear SVM is sensitive to feature scale, so train on the
# standardized frame instead.
X, y = norm_df.iloc[:, :-1].values, norm_df.iloc[:, -1].values
# Reduce the 7-class problem to binary: cover type 2 vs. everything else.
change_result = lambda x: 1 if x == 2 else 0
y = np.vectorize(change_result)(y)
# Hold out 30% of the rows as the test set; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# FIX: top-level pd.value_counts is deprecated (removed in recent pandas);
# call value_counts on a Series instead.
display(pd.Series(y_train).value_counts())

svm_model = SVC(class_weight='balanced', kernel='linear')  # 'rbf' is far too slow here
svm_model.fit(X_train, y_train)  # Very slow on ~580k rows — run it overnight.

# Training is expensive — persist the fitted model to disk right away.
with open('./model.pickle', 'wb') as model_file:
    pickle.dump(svm_model, model_file)


# Reload the model from disk (lets the evaluation below run without refitting).
with open('./model.pickle', 'rb') as model_file:
    svm_model = pickle.load(model_file)

# Accuracy on the training set.
train_pred = svm_model.predict(X_train)
train_acc = np.mean(train_pred == y_train)
print(f"train acc is: {train_acc}")

# Accuracy on the held-out test set.
test_pred = svm_model.predict(X_test)
test_acc = np.mean(test_pred == y_test)
print(f"test acc is: {test_acc}")

# Inspect the support vectors found by the fit.
display(svm_model.n_support_)
display(svm_model.support_vectors_)