JohnLyu's blog


boston_housing

Regression on the Boston housing dataset with a neural network

Data file: boston housing

```python
# Let's get some imports out of the way
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# PyTorch, for defining the model
import torch
import torch.nn as nn
import torch.nn.functional as F
```

Experiment workflow

  • Obtain the data
  • Analyze the data
  • Build the model
  • Evaluate the model
  • Save the model
  • Compare models

Obtaining the data

  • Use the provided boston_housing.txt file directly
  • Modifying the txt file directly is not allowed
```python
from io import StringIO
import pandas as pd

def read_text():
    with open("boston_housing.txt") as f:
        r_list = [i.replace("\n", "") for i in f.readlines()]

    # lines 7-20 hold the attribute descriptions; the data starts at line 22
    des_part = r_list[7:21]
    data_part = r_list[22:]

    # the first token of each description line is the attribute name
    des_part = [s.split()[0].strip() for s in des_part]
    # each record is wrapped across two physical lines, so join the pairs
    data_part = [
        data_part[i] + data_part[i + 1] for i in range(0, len(data_part), 2)
    ]
    df = pd.read_csv(StringIO("\n".join(data_part)),
                     sep=r"\s+",
                     header=None,
                     names=des_part)
    return df

boston_df = read_text()
boston_df.head()
```

CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 396.90 5.33 36.2

Meaning of the attributes:

CHAS and RAD should be treated as discrete variables.

- CRIM     per capita crime rate by town
- ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS    proportion of non-retail business acres per town
- CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX      nitric oxides concentration (parts per 10 million)
- RM       average number of rooms per dwelling
- AGE      proportion of owner-occupied units built prior to 1940
- DIS      weighted distances to five Boston employment centres
- RAD      index of accessibility to radial highways
- TAX      full-value property-tax rate per $10,000
- PTRATIO  pupil-teacher ratio by town
- B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT    % lower status of the population
- MEDV     Median value of owner-occupied homes in $1000's
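The claim above that CHAS and RAD are discrete can be checked by counting unique values per column. A minimal sketch on a toy frame (the values below are illustrative, not the real dataset):

```python
import pandas as pd

# Toy frame: CHAS is a 0/1 dummy, RAD a small integer index, RM continuous
df = pd.DataFrame({
    "CHAS": [0, 1, 0, 0, 1],
    "RAD":  [1, 2, 2, 3, 1],
    "RM":   [6.5, 6.4, 7.1, 6.9, 7.1],
})

# Columns with few unique values are good candidates for discrete treatment
cardinality = df.nunique()
discrete = [c for c in df.columns if cardinality[c] < 4]
print(discrete)  # CHAS and RAD have low cardinality; RM does not
```

The same idea, with a higher cutoff, is what the `get_tag_columns` helper below relies on.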

Next, check the characteristics and quality of the data.

```python
boston_df.describe()
```

CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677082 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000
```python
# check for missing values
print(np.sum(np.isnan(boston_df)))
```
CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

Analyzing the data

  • First, look at the dependent variable: it is roughly normally distributed, with some values piled up at the top of the distribution, which we will return to later.
  • Then look at the distribution of correlation coefficients in the dataset.
```python
# Let's use seaborn, because it is pretty. ;)
# See more here: http://seaborn.pydata.org/tutorial/distributions.html
import seaborn as sns
sns.displot(boston_df['MEDV']);
```



```python
corr = boston_df.corr()
corr.style.background_gradient(cmap='coolwarm')
```
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
CRIM 1.000000 -0.200469 0.406583 -0.055892 0.420972 -0.219247 0.352734 -0.379670 0.625505 0.582764 0.289946 -0.385064 0.455621 -0.388305
ZN -0.200469 1.000000 -0.533828 -0.042697 -0.516604 0.311991 -0.569537 0.664408 -0.311948 -0.314563 -0.391679 0.175520 -0.412995 0.360445
INDUS 0.406583 -0.533828 1.000000 0.062938 0.763651 -0.391676 0.644779 -0.708027 0.595129 0.720760 0.383248 -0.356977 0.603800 -0.483725
CHAS -0.055892 -0.042697 0.062938 1.000000 0.091203 0.091251 0.086518 -0.099176 -0.007368 -0.035587 -0.121515 0.048788 -0.053929 0.175260
NOX 0.420972 -0.516604 0.763651 0.091203 1.000000 -0.302188 0.731470 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.590879 -0.427321
RM -0.219247 0.311991 -0.391676 0.091251 -0.302188 1.000000 -0.240265 0.205246 -0.209847 -0.292048 -0.355501 0.128069 -0.613808 0.695360
AGE 0.352734 -0.569537 0.644779 0.086518 0.731470 -0.240265 1.000000 -0.747881 0.456022 0.506456 0.261515 -0.273534 0.602339 -0.376955
DIS -0.379670 0.664408 -0.708027 -0.099176 -0.769230 0.205246 -0.747881 1.000000 -0.494588 -0.534432 -0.232471 0.291512 -0.496996 0.249929
RAD 0.625505 -0.311948 0.595129 -0.007368 0.611441 -0.209847 0.456022 -0.494588 1.000000 0.910228 0.464741 -0.444413 0.488676 -0.381626
TAX 0.582764 -0.314563 0.720760 -0.035587 0.668023 -0.292048 0.506456 -0.534432 0.910228 1.000000 0.460853 -0.441808 0.543993 -0.468536
PTRATIO 0.289946 -0.391679 0.383248 -0.121515 0.188933 -0.355501 0.261515 -0.232471 0.464741 0.460853 1.000000 -0.177383 0.374044 -0.507787
B -0.385064 0.175520 -0.356977 0.048788 -0.380051 0.128069 -0.273534 0.291512 -0.444413 -0.441808 -0.177383 1.000000 -0.366087 0.333461
LSTAT 0.455621 -0.412995 0.603800 -0.053929 0.590879 -0.613808 0.602339 -0.496996 0.488676 0.543993 0.374044 -0.366087 1.000000 -0.737663
MEDV -0.388305 0.360445 -0.483725 0.175260 -0.427321 0.695360 -0.376955 0.249929 -0.381626 -0.468536 -0.507787 0.333461 -0.737663 1.000000

Splitting the dataset

  • By convention, y is the target variable and X (uppercase, denoting a matrix) holds the independent variables.
  • Use train_test_split to split the data into training and test sets.
  • This splits the original data into four parts: X_train, X_test, y_train, y_test.
  • The split samples at random; setting random_state fixes the seed so the split is reproducible.
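The reproducibility point can be shown directly: two calls to train_test_split with the same random_state produce identical splits. A minimal sketch with toy arrays:

```python
import numpy as np
from sklearn.model_selection import train_test_split

X_demo = np.arange(20).reshape(10, 2)
y_demo = np.arange(10)

# Same seed -> the same split every time
a_train, a_test, _, _ = train_test_split(X_demo, y_demo, test_size=0.3, random_state=0)
b_train, b_test, _, _ = train_test_split(X_demo, y_demo, test_size=0.3, random_state=0)
print(np.array_equal(a_train, b_train), np.array_equal(a_test, b_test))  # True True
```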
```python
def get_tag_columns(df, limit=10):
    '''find columns that contain discrete (categorical) data'''
    ret = []
    for col in df.columns:
        if df[col].nunique() < limit:
            ret.append(col)
    return ret
```
```python
from sklearn.preprocessing import StandardScaler

def preprocess(df):
    target_label = "MEDV"
    y = df[target_label].values
    df = df.drop(target_label, axis=1)

    # one-hot encode the discrete columns
    tags = get_tag_columns(df)
    one_hot_df = pd.get_dummies(df[tags], columns=tags)

    # standardize
    # note: the scaler is fit on the one-hot columns here; fitting it on the
    # continuous columns instead would be the more conventional choice
    ss = StandardScaler()
    X = np.concatenate([ss.fit_transform(one_hot_df),
                        df.drop(tags, axis=1).values], axis=1)
    return X.astype("float32"), y.astype("float32")
```
```python
# This will throw an error at import time if sklearn hasn't been upgraded:
# the old location was sklearn.cross_validation.train_test_split
from sklearn.model_selection import train_test_split

X, y = preprocess(boston_df)

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
```

(354, 22) (152, 22) (354,) (152,)
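The 22 columns come out as expected: in this dataset CHAS takes 2 distinct values and RAD takes 9, so one-hot encoding yields 11 dummy columns, plus the remaining 11 untouched features. A quick sanity check:

```python
# 13 original features minus the MEDV target is handled before encoding;
# of the 13 features, CHAS (2 levels) and RAD (9 levels) are one-hot encoded
n_features = 13
chas_levels, rad_levels = 2, 9

one_hot_cols = chas_levels + rad_levels  # 11 dummy columns
continuous_cols = n_features - 2         # 11 columns passed through unchanged
print(one_hot_cols + continuous_cols)    # 22, matching X_train.shape[1]
```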

Modeling

  • Next, set some hyperparameters for training
  • and define the structure of the neural network
```python
# Define training hyperparameters
batch_size = 50
test_batch_size = 50
num_epochs = 500
learning_rate = 0.01
size_hidden = 100

# Calculate some other hyperparameters based on the data
batch_no = len(X_train) // batch_size  # number of batches
cols = X_train.shape[1]  # number of columns in the input matrix
n_output = 1
```
```python
# Create the model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Assume that we are on a CUDA machine, then this should print a CUDA device:
print("Executing the model on :", device)
# note: the model and tensors are never moved with .to(device), so training
# below actually runs on the CPU even when a GPU is reported here

class Net(torch.nn.Module):
    def __init__(self, n_feature, size_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, size_hidden)  # hidden layer
        self.predict = torch.nn.Linear(size_hidden, n_output)  # output layer

    def forward(self, x):
        x = F.relu(self.hidden(x))  # activation function for the hidden layer
        x = self.predict(x)  # linear output
        return x

net = Net(cols, size_hidden, n_output)
```
Executing the model on : cuda:0
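For reference, this two-layer network is tiny. A quick arithmetic check of its parameter count, using the fact that nn.Linear(in, out) stores an out×in weight matrix plus an out-length bias:

```python
# Layer sizes matching the model above: 22 inputs, 100 hidden units, 1 output
n_feature, size_hidden, n_output = 22, 100, 1

hidden_params = size_hidden * n_feature + size_hidden  # 100*22 + 100 = 2300
predict_params = n_output * size_hidden + n_output     # 1*100 + 1 = 101
print(hidden_params + predict_params)                  # 2401
```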
```python
# Adam is a flavor of gradient descent that typically works better out of the box
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
# optimizer = torch.optim.SGD(net.parameters(), lr=0.2)
criterion = torch.nn.MSELoss(reduction='sum')  # mean squared loss for regression, summed over the batch
```
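Note that reduction='sum' adds up the squared errors over the batch, while 'mean' would divide by the number of elements. A worked numpy sketch of the difference (the numbers are illustrative):

```python
import numpy as np

pred = np.array([1.0, 2.0, 3.0, 4.0])
target = np.array([1.5, 2.0, 2.0, 5.0])

squared_errors = (pred - target) ** 2  # [0.25, 0.0, 1.0, 1.0]
print(squared_errors.sum())            # 2.25   <- reduction='sum'
print(squared_errors.mean())           # 0.5625 <- reduction='mean'
```

A practical consequence: with 'sum' the gradient magnitude scales with the batch size, so the effective learning rate changes if batch_size changes.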
```python
from torch.utils.data import TensorDataset, DataLoader

train_loader = DataLoader(
    TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                  torch.tensor(y_train, dtype=torch.float32)),
    batch_size=batch_size)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

test_loader = DataLoader(
    TensorDataset(X_test_tensor, y_test_tensor),
    batch_size=test_batch_size)
```
```python
train_loss = []
train_r2 = []
test_loss = []
test_r2_x = []
test_r2 = []
```
```python
from sklearn.metrics import r2_score

running_loss = 0.0
for epoch in range(num_epochs):
    # mini-batch learning
    net.train()
    for inputs, labels in train_loader:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, torch.unsqueeze(labels, dim=1))
        loss.backward()
        optimizer.step()

        # accumulate statistics
        running_loss += loss.item()

    if epoch % 20 == 0 or epoch == num_epochs - 1:
        with torch.no_grad():
            net.eval()
            result = net(X_test_tensor)
            pred = result.data[:, 0].numpy()
            # note: r2_score expects (y_true, y_pred); the arguments here are
            # reversed, which skews the reported r2 values below
            r2_test = r2_score(pred, y_test_tensor)
            test_r2_x.append(epoch)
            test_r2.append(r2_test)
            print('Epoch {:5}, loss {:10}, r2: {:3}'.format(epoch + 1, running_loss, r2_test))

    train_loss.append(running_loss)
    running_loss = 0.0
```
Epoch     1, loss 183307.15435791016, r2: -2.598610349628883
Epoch    21, loss 12408.028907775879, r2: 0.012998142345433839
Epoch    41, loss 9915.79284286499, r2: 0.3333615213725519
Epoch    61, loss 8882.879081726074, r2: 0.348562519228811
Epoch    81, loss 6280.446783065796, r2: 0.49985757136694375
Epoch   101, loss 5083.443628311157, r2: 0.47137647771965885
Epoch   121, loss 4280.029432296753, r2: 0.5414219095454249
Epoch   141, loss 3558.088544845581, r2: 0.6027891438869899
Epoch   161, loss 3337.6654014587402, r2: 0.6037971240580867
Epoch   181, loss 3318.3874855041504, r2: 0.5410183626548175
Epoch   201, loss 2860.4403324127197, r2: 0.5880700361113333
Epoch   221, loss 4202.569885253906, r2: 0.24101134042952954
Epoch   241, loss 2625.922472000122, r2: 0.6117615302232601
Epoch   261, loss 3141.6027069091797, r2: 0.518363446529775
Epoch   281, loss 3209.169864654541, r2: 0.699972316605045
Epoch   301, loss 2891.8109397888184, r2: 0.4471336384834912
Epoch   321, loss 2454.4431190490723, r2: 0.680289615010776
Epoch   341, loss 2721.572551727295, r2: 0.6537569197620496
Epoch   361, loss 2374.3646926879883, r2: 0.6716516044749056
Epoch   381, loss 3353.4731216430664, r2: 0.650187875665645
Epoch   401, loss 3029.42342376709, r2: 0.6688595181218332
Epoch   421, loss 6365.018569946289, r2: 0.5285317058425256
Epoch   441, loss 3570.9040994644165, r2: 0.6478276332416046
Epoch   461, loss 5335.774890899658, r2: 0.575522473404229
Epoch   481, loss 4620.325346946716, r2: 0.6434875354238424
Epoch   500, loss 3078.047025680542, r2: 0.6554469434865164
```python
from sklearn.metrics import r2_score

X = torch.FloatTensor(X_train)
result = net(X)
pred = result.data[:, 0].numpy()
print(len(pred), len(y_train))
# as above, the argument order is (y_pred, y_true) rather than the expected (y_true, y_pred)
r2_score(pred, y_train)
```
  354 354
  0.9136254737114893
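As flagged in the comments, r2_score is not symmetric in its arguments: R² = 1 − SS_res/SS_tot, and SS_tot is computed from the first argument (y_true), so swapping the arguments changes the denominator. A pure-Python sketch with illustrative numbers:

```python
def r2(y_true, y_pred):
    # R^2 = 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
    mean_true = sum(y_true) / len(y_true)
    ss_res = sum((t - p) ** 2 for t, p in zip(y_true, y_pred))
    ss_tot = sum((t - mean_true) ** 2 for t in y_true)
    return 1 - ss_res / ss_tot

y_true = [3.0, 2.0, 4.0, 5.0, 6.0]
y_pred = [2.5, 2.0, 4.0, 4.5, 7.0]

print(r2(y_true, y_pred))  # 0.85, the conventional value
print(r2(y_pred, y_true))  # different: the denominator now comes from y_pred
```

So the r2 numbers reported in this post are computed with the arguments reversed; the qualitative trends still hold, but the exact values differ from the conventional definition.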
```python
plt.figure()
train_loss_np = np.array(train_loss)
plt.plot(range(len(train_loss)), train_loss_np / train_loss_np.max(), "r", label="train")
plt.plot(test_r2_x, np.maximum(np.array(test_r2), 0), "g", label="test")
plt.legend()
```



Analysis of the training results

  • After 500 epochs, the test-set r2 is 0.72 and the training-set r2 is 0.90.
  • The loss is still falling, but the test-set r2 shows little improvement after about 150 epochs.
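Since the test r2 plateaus after roughly 150 epochs, a patience-based early-stopping rule could cut training time. A hedged sketch of one such rule (not what the post actually uses; the patience and min_delta values are illustrative):

```python
def should_stop(history, patience=3, min_delta=1e-3):
    """Stop when the metric has not improved by min_delta over the last `patience` checks."""
    if len(history) <= patience:
        return False
    best_before = max(history[:-patience])
    recent_best = max(history[-patience:])
    return recent_best < best_before + min_delta

# Example: r2 climbs, then flattens out -> stop
r2_history = [0.1, 0.3, 0.5, 0.60, 0.61, 0.608, 0.609, 0.610]
print(should_stop(r2_history))  # True
```

In the training loop above, this check could run on test_r2 after each evaluation and break out of the epoch loop when it fires.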
```python
# save the model
torch.save(net.state_dict(), "dnn.pt")
```

Model comparison

  • First import the linear regression model class: from sklearn.linear_model import LinearRegression
  • Create the model.
  • Fit it on the training set.
  • Obtain the trained model.
```python
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(X_train, y_train)
```
LinearRegression()
```python
print('R2 for Train', lm.score(X_train, y_train))
print('R2 for Test (cross validation)', lm.score(X_test, y_test))
```
R2 for Train 0.7732977014498186
R2 for Test (cross validation) 0.6789738823760003
```python
# GBDT model
from sklearn.ensemble import GradientBoostingRegressor

reg = GradientBoostingRegressor(random_state=0)
reg.fit(X_train, y_train)

print('R2 for Train', reg.score(X_train, y_train))
print('R2 for Test (cross validation)', reg.score(X_test, y_test))
```
R2 for Train 0.9841732181245905
R2 for Test (cross validation) 0.8405851379007085

Comparison conclusions

  • The linear regression model is slightly worse than the two-layer neural network, but it is more stable and trustworthy, and much faster to compute.
  • The GBDT model outperforms the neural network across the board on this problem.