Summary of Reusable Code for Machine Learning

import pandas as pd

def get_chunk_df(file, chunksize=6000000, sample=0.01):
    # Read a huge CSV in chunks and keep a random sample of each chunk
    chunks = pd.read_csv(file, chunksize=chunksize)
    chunks_list = []
    i = 0
    for chunk in chunks:
        i += 1
        print('Processing chunk %d, shape: %s' % (i, chunk.shape))
        chunks_list.append(chunk.sample(frac=sample, replace=False))
    chunk_df = pd.concat(chunks_list, axis=0)
    del chunks_list
    print('Sampled DataFrame shape:', chunk_df.shape)
    return chunk_df

file = '../data/train_20190518.csv'    
mini_train = get_chunk_df(file)
mini_train.to_csv('../data/mini_train.csv', index=False)
def get_same_uid_df(file, test_data, chunksize=6000000):
    # Keep only the training rows whose uId also appears in the test set
    chunks = pd.read_csv(file, chunksize=chunksize,
                         names=['label','uId','adId','operTime','siteId','slotId','contentId','netType'])
    chunks_list = []
    i = 0
    for chunk in chunks:
        i += 1
        print('Processing chunk %d, shape: %s' % (i, chunk.shape))
        chunks_list.append(pd.merge(chunk, test_data[['uId']].drop_duplicates(), on='uId').drop_duplicates())
    chunk_df = pd.concat(chunks_list, axis=0).drop_duplicates()
    del chunks_list
    print('Filtered DataFrame shape:', chunk_df.shape)
    return chunk_df

test_data = pd.read_csv('../data/test_20190518.csv',names=['uId','adId','operTime','siteId','slotId','contentId','netType'])
file = '../data/train_20190518.csv'    
same_uid_train = get_same_uid_df(file, test_data)
same_uid_train.to_csv('../data/same_uid_train.csv', index=False)
def get_interval_data(df, start_time, end_time):  # select the rows within a given time interval
    df['pay_time'] = pd.to_datetime(df['pay_time'])
    df = df[(df['pay_time'] >= pd.to_datetime(start_time)) & (df['pay_time'] <= pd.to_datetime(end_time))]
    return df
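
A quick usage sketch with toy data (the pay_time column is the one the function expects; the values are made up):

df = pd.DataFrame({
    'pay_time': ['2019-05-01 10:00:00', '2019-05-10 12:30:00', '2019-06-01 08:00:00'],
    'amount': [10.0, 25.5, 7.2]
})
get_interval_data(df, '2019-05-01', '2019-05-31')  # keeps the first two rows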
import numpy as np

def get_submit_csv(model, is_proba=False):
    if is_proba:
        y_pred = model.predict_proba(X_test)[:, 1]
    else:
        y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
    X_test['y_pred'] = np.around(y_pred, decimals=6)  # round predictions to 6 decimal places
    X_test.reset_index(level=0, inplace=True)
    submission = X_test[['index', 'y_pred']]
    submission.columns = ['id', 'probability']
    submission.to_csv('../submit/submission.csv', index=False)

get_submit_csv(lgb_clf)
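
The snippet above assumes a trained LightGBM classifier lgb_clf and a feature frame X_test already in scope. A minimal sketch of how they might be produced with LightGBM's scikit-learn API (hyperparameter values are illustrative, not the original ones):

import lightgbm as lgb
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=666)
lgb_clf = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.05)
lgb_clf.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(50)])  # early stopping sets lgb_clf.best_iteration_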

Quick look at the data structure:

import numpy as np
import pandas as pd
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.2f' % x)  # display plain decimals instead of scientific notation
df = pd.read_csv('./data/**.csv')
df.head()       # first five rows
df.info()       # quick description: total rows, each column's dtype and non-null count
df['my_columns'].value_counts()  # the categories in a column and how many samples fall in each
df.describe()   # summary statistics of the numerical attributes

%matplotlib inline
import matplotlib.pyplot as plt
df.hist(bins=50, figsize=(20, 15))  # histogram of every numerical attribute
plt.show()

Cross-validation
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=666)

best_k, best_p, best_score = 0, 0, 0
for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=k, p=p)
        scores = cross_val_score(knn_clf, X_train, y_train, cv=5)  # automatically splits the training data into folds
        score = np.mean(scores)
        if score > best_score:
            best_k, best_p, best_score = k, p, score
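
A natural follow-up (not in the original) is to refit with the best hyperparameters found and check the score on the held-out test set:

print(best_k, best_p, best_score)
best_knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=best_k, p=best_p)
best_knn_clf.fit(X_train, y_train)
best_knn_clf.score(X_test, y_test)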
Grid Search

Grid search with scikit-learn, using a kNN regressor as the example:

from sklearn.preprocessing import StandardScaler  # kNN is distance-based, so the features should be scaled first
standardScaler = StandardScaler()
standardScaler.fit(X_train)  # compute mean and standard deviation on the training data only
X_train_standard = standardScaler.transform(X_train)  # scale the training features
X_test_standard = standardScaler.transform(X_test)    # scale the test features with the training statistics

from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor()  # default hyperparameters
knn_reg.fit(X_train_standard, y_train)  # train on the scaled training features and their labels
knn_reg.score(X_test_standard, y_test)  # R^2 score on the scaled test features and true labels

from sklearn.model_selection import GridSearchCV  # grid search with cross-validation

param_grid = [
    {
        "weights": ["uniform"],
        "n_neighbors": [i for i in range(1, 11)]
    },
    {
        "weights": ["distance"],
        "n_neighbors": [i for i in range(1, 11)],
        "p": [i for i in range(1,6)]
    }
]

knn_reg = KNeighborsRegressor()  # default hyperparameters
grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=1, cv=5)  # verbose controls how much progress is printed
grid_search.fit(X_train_standard, y_train)

grid_search.best_params_  # the best hyperparameters found by the search

grid_search.best_score_  # note: the mean cross-validated score on the training folds, not the R^2 score the regressor reports

# best_estimator_ is the refitted regressor; its score method uses R^2
grid_search.best_estimator_.score(X_test_standard, y_test)  # test-set performance of the tuned model, comparable to the earlier metric

Debugging the gradient (numerical gradient checking):

X_b = np.hstack([np.ones((len(X), 1)), X])  # prepend a column of ones for the intercept term

def dJ_debug(theta, X_b, y, epsilon=0.01):
    res = np.empty(len(theta))  # a vector of the same length as theta
    for i in range(len(theta)):
        theta_1 = theta.copy()
        theta_1[i] += epsilon  # perturb the i-th dimension upwards
        theta_2 = theta.copy()
        theta_2[i] -= epsilon  # and downwards
        res[i] = (J(theta_1, X_b, y) - J(theta_2, X_b, y)) / (2 * epsilon)
    return res

def gradient_descent(dJ, X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):  # the first argument is the function that computes the gradient

    theta = initial_theta
    cur_iter = 0

    while cur_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        last_theta = theta
        theta = theta - eta * gradient
        if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
            break

        cur_iter += 1

    return theta
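
J is the loss function and is assumed to be defined elsewhere; a minimal sketch for linear regression (MSE loss), together with the analytic gradient that dJ_debug lets you verify (the learning rate here is illustrative):

def J(theta, X_b, y):  # MSE loss of linear regression
    return np.sum((y - X_b.dot(theta)) ** 2) / len(y)

def dJ_math(theta, X_b, y):  # analytic gradient, to compare against dJ_debug
    return X_b.T.dot(X_b.dot(theta) - y) * 2.0 / len(y)

initial_theta = np.zeros(X_b.shape[1])
theta_debug = gradient_descent(dJ_debug, X_b, y, initial_theta, eta=0.01)
theta_math = gradient_descent(dJ_math, X_b, y, initial_theta, eta=0.01)  # should come out close to theta_debug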

Constructing features with polynomial terms

Data preprocessing: add higher-order terms to the data

PolynomialFeatures:

from sklearn.preprocessing import PolynomialFeatures  # preprocessing: add new polynomial features to the data

poly = PolynomialFeatures(degree=2)  # add features up to this power, counted from 0: degrees 0, 1 and 2
poly.fit(X)  # same fit/transform pattern as StandardScaler and PCA: first compute the higher-order terms
X2 = poly.transform(X)  # stack the higher-order terms alongside the original data
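
For example, with two input features and degree=2, the transform yields the columns [1, x1, x2, x1², x1·x2, x2²] (X_demo below is illustrative):

import numpy as np
X_demo = np.arange(1, 5).reshape(2, 2)  # [[1, 2], [3, 4]]
PolynomialFeatures(degree=2).fit_transform(X_demo)
# array([[ 1.,  1.,  2.,  1.,  2.,  4.],
#        [ 1.,  3.,  4.,  9., 12., 16.]])   -> shape (2, 6)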

Pipeline

Adding polynomial terms to a model

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def PolynomialRegression(degree):  # wrap the pipeline in a reusable factory function
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("lin_reg", LinearRegression())
    ])

poly_reg = PolynomialRegression(degree=2)
poly_reg.fit(X, y)
y_predict = poly_reg.predict(X)
mean_squared_error(y, y_predict)

Plotting learning curves

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def plot_learning_curve(algo, X_train, X_test, y_train, y_test):  # arguments: the model (any estimator) and the data
    train_score = []  # two containers for the training and test scores
    test_score = []
    for i in range(1, len(X_train)+1):  # one fit per training-set size, starting from a single sample
        algo.fit(X_train[:i], y_train[:i])  # train on the first i samples

        y_train_predict = algo.predict(X_train[:i])  # evaluate on the i samples the model has already seen
        train_score.append(mean_squared_error(y_train[:i], y_train_predict))

        y_test_predict = algo.predict(X_test)  # evaluate on the held-out test data
        test_score.append(mean_squared_error(y_test, y_test_predict))

    plt.plot([i for i in range(1, len(X_train)+1)], np.sqrt(train_score), label="train")  # plot RMSE rather than MSE
    plt.plot([i for i in range(1, len(X_train)+1)], np.sqrt(test_score), label="test")
    plt.legend()  # the curves carry labels, so show the legend
    plt.axis([0, len(X_train)+1, 0, 4])
    plt.show()  # the curves show how both errors evolve as the training set grows

plot_learning_curve(LinearRegression(), X_train, X_test, y_train, y_test)

Combining pipelines with learning curves to train several models quickly

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

def PolynomialRegression(degree):
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),  # generate the higher-order features
        ("std_scaler", StandardScaler()),  # the new features differ widely in scale, so standardize
        ("lin_reg", LinearRegression())  # fit the model
    ])

poly2_reg = PolynomialRegression(degree=2)  # instantiate the pipeline: the data flows through the three steps in order
plot_learning_curve(poly2_reg, X_train, X_test, y_train, y_test)  # pass the pipeline and the data to the learning-curve function

Plotting decision boundaries

import numpy as np
import matplotlib.pyplot as plt

def plot_decision_boundary(model, axis):  # arguments: a trained model and the axis range to draw

    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1]-axis[0])*100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3]-axis[2])*100)).reshape(-1, 1),
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]  # predict on a dense grid covering the plotting area

    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])  # three colours, one per class

    plt.contourf(x0, x1, zz, cmap=custom_cmap)
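
The call below assumes a classifier log_reg already fitted on two features. A minimal sketch on two Iris classes and the first two features (illustrative data, chosen to roughly match the axis range):

from sklearn import datasets
from sklearn.linear_model import LogisticRegression

iris = datasets.load_iris()
X = iris.data[iris.target < 2, :2]  # two classes, first two features (sepal length and width)
y = iris.target[iris.target < 2]
log_reg = LogisticRegression().fit(X, y)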
    
plot_decision_boundary(log_reg, axis=[4, 7.5, 1.5, 4.5])

Soft Voting Classifier 

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
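
The fit below assumes X_train, X_test, y_train, y_test from an earlier split; a quick illustrative setup (make_moons is an assumption, not the original data):

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=500, noise=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)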

voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()), 
    ('svm_clf', SVC(probability=True)),  # soft voting averages predict_proba, so SVC must be fitted with probability=True
    ('dt_clf', DecisionTreeClassifier(random_state=666))],
                             voting='soft')

voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

 
