Artificial Intelligence Fundamentals

Python Basics

Input and Output

print("hello world")
print("hello","world")
print("hello,i'm %s,%d years old!"%('tom',20)) # 字符串 整数
print("%2d"%3) # 整数
print("%02d"%3) # 两位整数
print("%.2f"%3.1415926)
print("%.2f%%"%25) # 百分数
str = "i'm {0},{1} years old".format("tom",20) # 按索引

Tuples

students = ("jim","jack")
for s in students:
print(s)
len(students)
students[1],students[-1],students[10]

Tuples cannot be modified.
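
Attempting to modify a tuple raises a TypeError, for example:

students = ("jim","jack")
try:
    students[0] = "rose"  # tuples do not support item assignment
except TypeError as e:
    print(e)              # 'tuple' object does not support item assignment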

List

students = ["jim","jack"]
students.append("rose") # 添加
students.insert(2,"lee") # 按位置插入
# students.pop() # 弹出
# students.pop(2)
students.sort() # 排序
for s in students:
print(s)

Set

set1 = {1,3,5,7,9}
set2 = {1,3,5,7,9,3,7,9}  # duplicates are dropped: {1,3,5,7,9}
set1.add(11); set1.remove(5)
s1 = {1,3,5,7}
s2 = {5,7,9,11}
s3 = s1 & s2              # intersection
s4 = s1 | s2              # union
s4 = set([1,3,5,7])
s5 = set(list(range(1,10)))

Set elements are unique; duplicates are removed automatically.
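
A quick check: building a set from a list silently drops duplicates.

nums = [1,3,3,5,5,5]
print(set(nums))       # {1, 3, 5}
print(len(set(nums)))  # 3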

Dict

d = {"s01":"jim","s02":"tom","s03":"jack"}
d["s05"] = "rose"
d["s02"]
d.get("s01")
if "s05" in d:
d.pop("s05")

for k in d:
print("%s->%s"%(k,d.get(k)))

for v in d.values():
print("%s->%s"%(k,v))

List Comprehensions

my_list = list(range(1,10,2))
my_list = [x * x for x in range(1,10,2)]
print(my_list)
my_list = [x * y for x in range(1,10) for y in range(1,10)]
print(my_list)
my_list = [x * x for x in range(1,21) if x % 3 == 0]
print(my_list)
my_list = [m + n.upper() for m in "hello" for n in "world"]
print(my_list)

Slicing

my_list = list(range(20)) 
my_list[0:5],my_list[5:10],my_list[:10],my_list[10:]
my_list[-5:-1],my_list[-10:],my_list[:-10]
my_list[1:10:3],my_list[:],my_list[::3]

my_tuple = tuple(range(20))  # tuples support slicing too
my_tuple[3:-10:2]

my_str = "hello world" # 字符串可以看成一个list
my_str[3:5]
my_str[3:-2]
my_str[3:-2:2]

Generators

g = (x for x in range(1,11))  # generator expression
def fib(max_num):
    i,j = 1,1
    yield i
    while j < max_num:
        yield j
        i,j = j,i+j
for num in fib(10):
    print(num)
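
The generator expression g above is never consumed; a quick usage sketch:

g = (x for x in range(1,11))
print(next(g))  # 1
print(next(g))  # 2
print(list(g))  # remaining values: [3, 4, ..., 10]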

NumPy Basics

Fundamentals

# initializing ndarrays
import numpy as np
my_array = np.array([1,2,3,4,5])     # from a list
np.arange(10)                        # auto-generated range
np.linspace(1,19,10,endpoint=False)  # evenly spaced values
np.zeros(20,int)                     # zeros (np.int is deprecated; use int)
np.ones(20,int)                      # ones
np.random.randn(10)                  # samples from the standard normal
a = np.arange(1,10,1)                # with a step
b = np.arange(24).reshape(2,3,4)     # reshape a vector into a 3-D array
b[1,1,3],b[0,1,:],b[0,2]             # indexing
c = np.arange(1,20,1)
c[c>=15]                             # boolean filtering
c[~(c>=15)]
c[(c>=5)&(c<=15)]

mat1 = np.mat([[1,2,3],[4,5,6]])      # build a matrix
mat1*8                                # scalar multiplication
mat2 = mat1.T                         # transpose
mat1*mat2                             # matrix multiplication (2x3 @ 3x2)
np.mat(np.zeros((3,3)))               # 3x3 zero matrix
mat6 = np.mat(np.eye(2,2,dtype=int))  # identity matrix
a1 = [1,2,3]
a2 = np.mat(np.diag(a1))              # diagonal matrix

Hands-on Project

import numpy as np
import pandas as pd
data = pd.read_csv('/home/admin/jupyter/download/ai/python/data/1/test.csv')
c = data['摄氏度']
v = data['销量']

vwap = np.average(c,weights=v)  # weighted average (e.g. volume-weighted average price, VWAP)
mean = np.mean(c)               # arithmetic mean

# Time-weighted average price (TWAP): another average-price indicator,
# giving more recent prices a higher weight
t = np.arange(len(c))
twap = np.average(c,weights=t)

# Maximum and minimum:
# the highest of the high prices and the lowest of the low prices
h,l = np.loadtxt('data.csv',delimiter=',',usecols=(4,5),unpack=True)

highest = np.max(h)
lowest = np.min(l)

# ptp computes an array's range: the difference between its max and min
spread_high_price = np.ptp(h)
spread_low_price = np.ptp(l)

# Simple statistics
# median
median = np.median(c)

# variance
variance = np.var(c)

# Simple returns
# The simple return is the rate of change between adjacent prices; the log
# return is the pairwise difference of the logarithms of the prices
c = np.loadtxt('data.csv',delimiter=',',usecols=(6,),unpack=True)
returns = np.diff(c)/c[:-1]

# standard deviation via std
std_deviation = np.std(returns)

# log returns
logreturns = np.diff(np.log(c))

# annualized volatility
annual_volatility = np.std(logreturns)/np.mean(logreturns)
annual_volatility = annual_volatility/np.sqrt(1./252.)

Pandas Data Processing

Pandas Basics

from pandas import Series,DataFrame
import pandas as pd
# create from a list
s1 = pd.Series([157,80,167,90])
s2 = pd.Series([100,78,65,77],index = ["chinese","english","history","maths"])
# create from a dict
d1 = {"name":"张三","Gender":"男","age":20,"height":180,"weight":66}
s3 = pd.Series(d1)
# DataFrame
dfPerson = {
    "name":["tom","jack","kitty","eric"],
    "age":[20,19,21,22],
    "height":[180,178,170,182],
    "weight":[66,65,52,75]
}
personSheet = pd.DataFrame(dfPerson)
personSheet.head()
# create a DataFrame from numpy
import numpy as np
numframe = np.random.randn(10,5)
numSheet = pd.DataFrame(numframe)
# 1.1 Common operations
shape returns the table's rows and columns as a tuple (an attribute, not a method);

info() shows the column data types;

isnull() flags which values are missing;

head() takes the number of leading rows to return;

describe() summarizes the numeric columns: mean, min/max, standard deviation, quantiles;

drop() removes rows or columns; the inplace parameter controls whether the original table is modified;

dropna() drops rows with missing values and returns the result; with how='all', a row is dropped only when every value is missing;

fillna() fills missing values with the value passed in, or per column when given a dict;
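
A brief demonstration of these operations on the personSheet DataFrame defined above:

personSheet.shape                           # (4, 4): rows, columns
personSheet.info()                          # column dtypes and non-null counts
personSheet.isnull()                        # boolean mask of missing values
personSheet.head(2)                         # first 2 rows
personSheet.describe()                      # mean, std, min/max, quantiles
personSheet.drop(columns=['weight'])        # returns a copy; pass inplace=True to modify in place
personSheet.dropna(how='all')               # drop rows where every value is missing
personSheet.fillna({'age':0,'height':170})  # fill per column via a dict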

# 1.2 Sorting
dataSort = pd.Series(range(5),index=['b','a','e','c','d'])
dataSort.sort_index()

# build a DataFrame directly from a dict
test_dict = {'id':[1,2,3,4,5,6],'name':['Alice','Bob','Cindy','Eric','Helen','Grace'],'math':[90,89,99,78,97,93],'english':[89,94,80,94,94,90]}
personInfo = pd.DataFrame(test_dict)

# concatenate DataFrames column-wise
personInfo2 = pd.DataFrame({'Gender':{0:'男',1:'男',2:'男',3:'男',4:'男',5:'男'}})
personInfo = pd.concat([personInfo,personInfo2],axis=1)

Hands-on Project

import numpy as np
import pandas as pd
data = pd.read_csv('/home/admin/jupyter/download/tianchi_learn_project/data/1/titanic_train.csv')
data.head()
data.loc[5:10,['Name','Sex','Survived']]  # rows 5-10 of the selected columns
data.Survived.value_counts()              # distribution of Survived values
data.Pclass.value_counts()
data.isnull().sum()                       # number of missing values per column

data.groupby(['Sex','Survived'])['Survived'].count()  # group by sex and count survivors

pd.crosstab(data.Pclass,data.Survived,margins=True)   # cross-tabulation
pd.crosstab([data.Embarked,data.Pclass],[data.Sex,data.Survived],margins=True)

data.isnull().sum()
data['Age'] = data['Age'].fillna(data['Age'].mean())  # fill missing ages with the mean

Matplotlib Data Visualization

Fundamentals

import numpy as np
import matplotlib.pyplot as plt
# scatter plot
x1 = np.array([6])
y1 = np.array([4])
plt.scatter(x1,y1)
plt.show()
# line plot
x2 = np.array([2,6])
y2 = np.array([2,4])
plt.plot(x2,y2)
plt.show()
# two figures
plt.figure()
plt.scatter(x1,y1,color="blue")
plt.figure()
plt.plot(x2,y2,color="red",linestyle='--')
# axis ranges
plt.xlim(0,8)
plt.ylim(0,5)
# set the axis ticks, optionally replacing them with labels
x_ticks = [1,2,3,5,6]
plt.xticks(x_ticks)
plt.yticks([1,2,3,5,6],['one','two','three','four','five'])
# axis labels
plt.xlabel("i am x",fontsize=15)
plt.ylabel("i am y",fontsize=15)
# legend
plt.legend(loc='lower right')
# subplots (several plots on one figure)
plt.subplot(2,2,1)  # 2x2 grid, first position

Plotting

# scatter plot
n = 1024
x2 = np.random.normal(0,1,n)
y2 = np.random.normal(0,1,n)
plt.scatter(x2,y2,s=80,alpha=0.5)
plt.show()
# line chart
x3_1 = np.array(['2014','2015','2016','2017','2018'])
y3_1 = np.array([12312,12321,45345,54565,23432])
plt.title("visitors",fontsize=15)
plt.xlabel("year",fontsize=15)
plt.ylabel("num(10 thousand)",fontsize=15)
plt.plot(x3_1,y3_1)
plt.scatter(x3_1,y3_1)
plt.show()
# bar chart
x4 = ['2014','2015','2016','2017','2018']
y4 = [200,300,150,350,800]
rects = plt.bar(x4,height=y4,width=0.4,alpha=0.8,color='g')
plt.ylim(0,900)
plt.ylabel('sale(10 thousand)',fontsize=15)
plt.xlabel('year',fontsize=15)
plt.title('income',fontsize=15)

# annotate each bar with its value
for rect in rects:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width() / 2,height + 10,str(height),ha="center")

plt.show()
# histogram
scoresT1 = np.random.randint(0,100,100)
scoresT2 = np.random.randint(0,100,100)
x = [scoresT1,scoresT2]
colors = ["g","b"]
labels = ["class A","class B"]
bins = range(0,101,10)
plt.hist(x,bins=bins,color=colors,histtype="bar",label=labels)
plt.legend(loc="upper left")
plt.show()
# pie chart
labels = ['one','two','three','four']
sizes = [2332,3413,23423,79832]
colors = ['r','g','y','b']
explode = (0.1,0,0,0)
plt.pie(sizes,labels=labels,colors=colors,explode=explode,shadow=True,radius=1.0,autopct="%1.1f%%")
plt.title("支付方式")
plt.show()
# 3D plot
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = Axes3D(fig)

X = np.arange(-4,4,0.25)
Y = np.arange(-4,4,0.25)
X,Y = np.meshgrid(X,Y)
Z = X ** 2 + Y ** 2
ax.scatter(X,Y,Z)

ax.plot_surface(X,Y,Z,rstride=1,cstride=2,cmap=plt.get_cmap('rainbow'))

plt.show()


### Case 2: merchant payment data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv("data.csv",encoding="gbk")
data['商户名称']="隐藏商户名"

# total payment amount and order count over the last 10 days
plt.figure(figsize=(15,8))
data.dropna(axis=0,how='any',inplace=True)  # drop rows with missing values
data['支付时间'] = pd.to_datetime(data['支付时间'])  # parse payment-time strings to datetimes
data['支付日期'] = [x.strftime("%Y-%m-%d") for x in data['支付时间']]
data2 = data.groupby(['支付日期'])['金额'].sum()  # total amount per day
x4 = data2.index[-10:]
y4 = data2[-10:]

rects1 = plt.bar(x4,height=y4,width=0.4,alpha=0.8,color='g')
plt.ylabel("支付总额",fontsize=15)
plt.xlabel("日期",fontsize=15)
plt.title("最近10天支付总额",fontsize=15)
for rect in rects1:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width() / 2, height + 1000, str(height), ha="center")
plt.show()

### order counts by hour of day
fig = plt.figure(figsize=(10,6))
data['几点'] = [x.strftime('%H') for x in data['支付时间']]

data5 = data.groupby('几点')['金额'].size()
x5 = data5.index
y5 = data5

plt.bar(x5,y5)
plt.xlabel("时间段",fontsize=15)
plt.ylabel("订单数",fontsize=15)
plt.title("时间段统计",fontsize=15)
plt.show()

Data Analysis

Fundamentals

Missing data
Why data goes missing:
    1. unintentional
    2. intentional
    3. the value does not exist
Types of missingness:
    missing completely at random (MCAR)
    missing at random (MAR)
    missing not at random (MNAR)
Ways to handle missing data:
    drop the records
    imputation:
        replace with a constant
        fit/estimate the missing value
        dummy (missing-indicator) variables
    do nothing
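
Of these, only the dummy-variable (missing-indicator) approach is not shown in the code below; a minimal sketch:

import pandas as pd
import numpy as np
df = pd.DataFrame({'one':[100,90,np.nan,95]})
df['one_missing'] = df['one'].isnull().astype(int)  # 1 marks a missing value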


# imports
import pandas as pd
import numpy as np

# a DataFrame with missing values
data_dict = {
    'one':[100,90,np.nan,95],
    'two':[30,45,56,np.nan],
    'three':[np.nan,40,80,90]
}

df = pd.DataFrame(data_dict)
# flag the non-missing values
df.notnull()
# select the rows where "one" is missing
bool_series = pd.isnull(df["one"])
df[bool_series]

# visualize missing data
%matplotlib inline
import missingno as msno
# build a dataset with missing values
def get_random_nan_list(list_data,num=100):
    p = np.random.uniform(0,1,size=(len(list_data)))  # random weights
    p = p / np.sum(p)                                  # normalize to probabilities
    data = np.random.choice(list_data,num,p=p)         # p gives each element's sampling probability
    return data
data_dict = {
    'First Score':get_random_nan_list([100,np.nan,np.nan,95]),
    'Second Score':get_random_nan_list([30,np.nan,45,56]),
    'Third Score':get_random_nan_list([52,np.nan,80,98]),
    'Fourth Score':get_random_nan_list([60,67,68,65]),
}

df = pd.DataFrame(data_dict)

## Filling
# fill missing values with 0
df.fillna(0)
# fill one column with a specific value
df['Second Score'].fillna(np.mean(df['Second Score']))
# fill via replace
df.replace(to_replace=np.nan,value=-99)
# fill via linear interpolation
df.interpolate(method='linear',limit_direction='forward')

## Dropping
# drop rows containing NaN
df.dropna()
# drop rows where every value is NaN
df.dropna(how='all')
# drop columns containing at least one NaN
df.dropna(axis=1)
# drop rows containing at least one NaN
df.dropna(axis=0,how='any')

## sklearn
# note: Imputer was removed in scikit-learn 0.22; newer versions use sklearn.impute.SimpleImputer
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN',strategy='mean',axis=0)
X = np.array([[1,2],[np.nan,3],[7,6]])
Y = [[np.nan,2],[6,np.nan],[7,6]]
imp.fit(X)
imp.transform(Y)

Data Analysis Hands-on 1

import pandas as pd
import numpy as np
train = pd.read_csv('/home/admin/jupyter/download/tianchi_learn_project/data/1/titanic_train.csv')
test = pd.read_csv('/home/admin/jupyter/download/tianchi_learn_project/data/1/titanic_test.csv')

%matplotlib inline
# package for visualizing missing values
import missingno as msno

msno.matrix(train,labels=True)  # very handy: shows the missing-data pattern at a glance

# handle missing values
train.fillna(0)
test.fillna(0)

# forward-fill with the previous value
train.fillna(method='pad')
# backward-fill with the next value
train.fillna(method='bfill')

train.fillna(method='pad',inplace=True)  # with inplace=True, do not reassign (it returns None)
train['Age'] = train['Age'].fillna(np.mean(train['Age']))  # fill with the mean
test['Age'] = test['Age'].fillna(np.mean(train['Age']))    # fill with the training-set mean
train.replace(to_replace=np.nan,value=-99)                 # replace with a given value

# linear interpolation
train.interpolate(method='linear',limit_direction='forward')
# drop missing values
train.dropna()

# fill with sklearn's Imputer (SimpleImputer in scikit-learn >= 0.22)
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN',strategy='mean',axis=0)
imp.fit(train['Age'].values.reshape(-1,1))

train_age = imp.transform(train['Age'].values.reshape(-1,1))
test_age = imp.transform(test['Age'].values.reshape(-1,1))

# cap values above the 99th percentile
move_data = [np.percentile(train['Fare'],99) if d > np.percentile(train['Fare'],99) else d for d in train.Fare]
train.Fare.hist(bins=32)      # original distribution

plt.hist(move_data,bins=100)  # capped distribution

np.max(move_data)  # maximum after capping

Overview of Data Feature Analysis

Distribution analysis
    one-dimensional discrete distributions
    one-dimensional continuous distributions
    joint distributions
Summary statistics
    mean
    median
    mode
    range
    standard deviation
    coefficient of variation
    quantiles
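
A compact sketch computing the summary statistics above with numpy/scipy on a made-up sample:

import numpy as np
from scipy import stats

x = np.random.randint(0, 10, 1000)     # made-up sample
print(np.mean(x))                      # mean
print(np.median(x))                    # median
print(stats.mode(x)[0])                # mode
print(np.ptp(x))                       # range (max - min)
print(np.std(x))                       # standard deviation
print(np.std(x) / np.mean(x))          # coefficient of variation
print(np.percentile(x, [25, 50, 75]))  # quantiles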

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# probability mass plots
# Bernoulli distribution
X = np.arange(0,2,1)
p1 = 0.5
plist1 = stats.bernoulli.pmf(X,p1)
plt.plot(X,plist1,marker='o',linestyle='None')
plt.vlines(X,0,plist1)
# plt.xlabel('one coin toss')
# plt.ylabel('probability')
# plt.title('Bernoulli distribution: p1=%.2f' % p1)
plt.show()

n = 5
p2 = 0.5
X1 = np.arange(0,n+1,1)
plist2 = stats.binom.pmf(X1,n,p2)
plt.plot(X1,plist2,marker='o',linestyle='None')
plt.vlines(X1,0,plist2)
# plt.xlabel('number of heads')
# plt.ylabel('probability')
# plt.title('binomial distribution: n=%i,p2=%.2f' % (n,p2))
plt.show()

# mean, variance, skewness and kurtosis via numpy and stats
x = np.random.randn(10000)
mu = np.mean(x,axis=0)
sigma = np.std(x,axis=0)
skew = stats.skew(x)
kurtosis = stats.kurtosis(x)

%matplotlib inline
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15,6

data = pd.read_csv('/home/admin/jupyter/download/ai/python/data/3/AirPassengers.csv')
data['Month'] = pd.to_datetime(data['Month'])
data = data.set_index(['Month'])  # use the time column as the index
data.head()

# decompose the time series
decomposition = seasonal_decompose(data)
trend = decomposition.trend
seasonal = decomposition.seasonal
resid = decomposition.resid
# plot the decomposition
plt.subplot(411)
plt.plot(data,label='Original')
plt.subplot(412)
plt.plot(trend,label='Trend')
plt.subplot(413)
plt.plot(seasonal,label='Seasonal')
plt.subplot(414)
plt.plot(resid,label='resid')
plt.tight_layout()
plt.show()

# correlation between feature C0 and the target
# (assumes a DataFrame df with columns 'C0' and 'target')
import seaborn as sns
fcols = 2
frows = 1

plt.figure(figsize=(8,4))

ax = plt.subplot(1,2,1)
sns.regplot(x=df['C0'],y='target',data=df,ax=ax,
            scatter_kws={'marker':'.','s':3,'alpha':0.3},
            line_kws={'color':'k'})

plt.xlabel('C0')
plt.ylabel('target')

# histogram of C0
ax = plt.subplot(1,2,2)
sns.distplot(df['C0'].dropna())
plt.xlabel('C0')
plt.show()

# correlation matrix
df.corr()
# correlation heatmap
ax = plt.subplots(figsize=(20,16))
ax = sns.heatmap(df.corr(), vmax=.8, square=True, annot=True)

Feature Engineering

Basic Feature Engineering

import numpy as np
import pandas as pd
train = pd.read_csv('/home/admin/jupyter/download/tianchi_learn_project/data/1/titanic_train.csv')
test = pd.read_csv('/home/admin/jupyter/download/tianchi_learn_project/data/1/titanic_test.csv')
df_all = pd.concat([train,test])[train.columns]
# basic feature engineering
# return the first substring of substrings found in txt_string, else NaN
def substrings_in_string(txt_string, substrings):
    for substring in substrings:
        if substring in txt_string:
            return substring
    return np.nan

title_list = ['Mrs','Mr','Master','Miss','Major','Rev','Dr',
              'Ms','Mlle','Col','Capt','Mme','Countess','Don','Jonkheer']

df_all['Title'] = df_all['Name'].map(lambda x:substrings_in_string(x,title_list))

# transform the cabin column
cabin_list = ['A','B','C','D','E','F','T','G','Unknown']
df_all['Deck'] = df_all['Cabin'].map(lambda x:substrings_in_string(str(x),cabin_list))

# hand-crafted features
df_all['Family_Size'] = df_all['SibSp'] + df_all['Parch']
df_all['Age*Class'] = df_all['Age'] * df_all['Pclass']
df_all['Fare_Per_Person'] = df_all['Fare'] / (df_all['Family_Size'] + 1)

# drop unused features
del_col = ['Name','Ticket','Cabin']
for col in del_col:
    del df_all[col]

# one-hot encode the categorical variables
for col in ['Sex','Embarked','Title','Deck']:
    df_temp = pd.get_dummies(df_all[col])
    df_temp.columns = [col+c for c in df_temp]
    df_all = pd.concat([df_all, df_temp], axis=1)  # concatenate column-wise
    del df_all[col]

# min-max normalize the continuous features
for col in ['Pclass','Age','Fare','Age*Class','Fare_Per_Person']:
    df_all[col] = (df_all[col] - np.min(df_all[col])) / (np.max(df_all[col]) - np.min(df_all[col]))

Sampling

import numpy as np
# sampling with replacement
def random_sampling_back(data, sample_num, p=0.5):
    sample_cnt = 0
    len_data = len(data)
    sample_data = []
    while(1):
        for d in data:
            rp = np.random.uniform(0,1)
            if rp > p:
                sample_data.append(d)
                sample_cnt += 1

                if sample_cnt >= sample_num:
                    return sample_data

data = list(np.random.random(100))
data_sample = random_sampling_back(data,10,p=0.5)

# sampling without replacement
def random_sampling_no_back(data,sample_num,p=0.5):
    sample_cnt = 0
    len_data = len(data)
    sample_data = []
    sample_map = {}

    if sample_num >= len_data:
        raise RuntimeError('sample size must not exceed the dataset size')

    while(1):
        for d in data:
            rp = np.random.uniform(0,1)
            if (rp > p) and (d not in sample_map):
                sample_data.append(d)
                sample_cnt += 1
                sample_map[d] = 1

                if sample_cnt >= sample_num:
                    return sample_data
data_sample_no_back = random_sampling_no_back(data,10,p=0.5)
data_sample_no_back

# stratified sampling
data1 = [1] * 10
data2 = [2] * 20
data3 = [3] * 30

data_sample = []
data_sample += random_sampling_back(data1, 5) + random_sampling_back(data2, 10) + random_sampling_back(data3, 15)

# probability-weighted sampling
def sampling_prob(data,num,p_list):
    data_len = len(data)
    if data_len != len(p_list):
        raise RuntimeError('one sampling probability per sample is required: data and p_list lengths must match')
    return list(np.random.choice(data,num,p=p_list))

data1 = list(range(100))
p_list = np.array(data1) / np.sum(data1)
sampling_prob(data1, 10, p_list)

## learning from imbalanced data
import pandas as pd
from sklearn.datasets import make_classification

# generate a classification dataset
X,y = make_classification(
    n_classes=2, class_sep=1.5, weights=[0.9, 0.1],
    n_informative=3, n_redundant=1, flip_y=0,
    n_features=20, n_clusters_per_class=1,
    n_samples=100, random_state=10
)

df = pd.DataFrame(X)
df['target'] = y
df.target.value_counts().plot(kind='bar', title='Count (target)')

# visualize the class proportions
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt

conf_mat = confusion_matrix(y_true=y, y_pred=y)
print('Confusion matrix:\n',conf_mat)

labels = ['Class 0','Class 1']
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(conf_mat,cmap=plt.cm.Blues)
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('Expected')
plt.show()

# count the labels
count_class_0,count_class_1 = df.target.value_counts()
print(count_class_0,count_class_1)

# split by label
df_class_0 = df[df['target'] == 0]
df_class_1 = df[df['target'] == 1]

# randomly oversample the minority class
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over],axis=0)

print('random over-sampling:')
print(df_test_over.target.value_counts())

df_test_over.target.value_counts().plot(kind='bar', title='Count (target)')

Dimensionality Reduction

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import seaborn as sns
X,y = make_classification(
    n_classes=2,class_sep=1.5,weights=[0.5,0.5],
    n_informative=3,n_redundant=1,flip_y=0,
    n_features=20,n_clusters_per_class=1,
    n_samples=100,random_state=10
)
df = pd.DataFrame(X)
df['target'] = y

# sklearn PCA
# split the data
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
# normalize the data
x_train_N = (x_train - x_train.mean()) / (x_train.max() - x_train.min())
x_test_N = (x_test - x_test.mean()) / (x_test.max() - x_test.min())

from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_train_N)
plt.figure(1, figsize=(14,13))
plt.clf()
plt.axes([.2,.2,.4,.4])
plt.plot(pca.explained_variance_ratio_,linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_ratio_')
plt.show()

## feature selection
# cosine similarity on min-max scaled vectors
def sim(x1,x2):
    x1 = (x1 - np.min(x1)) / (np.max(x1) - np.min(x1))
    x2 = (x2 - np.min(x2)) / (np.max(x2) - np.min(x2))
    vec = np.sum(x1 * x2)
    vec_x1 = np.sqrt(np.sum(x1 ** 2))
    vec_x2 = np.sqrt(np.sum(x2 ** 2))
    return vec / (vec_x1 * vec_x2)

# score each feature's similarity with the target
col_list = df.columns
col_sim = []
for col in col_list:
    col_sim.append(sim(df[col],df['target']))

plt.figure(1,figsize=(14,13))
plt.title("Feature Similarity Importances")
plt.bar(range(len(col_list)), [int(x*100) for x in col_sim], color="g", yerr=1, align="center")
plt.xticks(range(len(col_list)), col_list, rotation=90)
plt.xlim([-1, len(col_list)])
plt.show()

# filter method
# correlation-based selection
def col_selector(X, y, top_k=10):
    cor_list = []
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i],y)[0,1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # keep the top_k features by absolute correlation
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-top_k:]]

    return cor_feature
cor_feature = col_selector(pd.DataFrame(X),y)  # the 10 features most correlated with y

# sklearn chi2 selection
X = pd.DataFrame(X)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2,k=10)
chi_selector.fit(X_norm,y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()

Feature Engineering Project 1 (Data Analysis and Cleaning)

import pandas as pd
import numpy as np
# load the data
model_sample = pd.read_csv('/home/admin/jupyter/download/tianchi_learn_project/data/3/model_sample.csv')
model_sample.y.value_counts()

model_sample.x_008.value_counts()

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# inspect the feature distributions with histograms
columns = [c for c in model_sample.columns if c not in ['user_id']][:10]
fcols = 6
frows = len(columns)
plt.figure(figsize=(5*fcols,4*frows))

i = 0
for col in columns:
    i += 1
    ax = plt.subplot(frows,fcols,i)
    model_sample[col].hist()
    plt.xlabel(col)
    plt.ylabel('nums')

Feature Engineering Project 2 (Data Sampling)

# data sampling
%matplotlib inline
import pandas as pd

data = pd.read_csv('/home/admin/jupyter/download/tianchi_learn_project/data/3/model_sample.csv')

# missing-data matrix view
import missingno as msno
msno.matrix(data,labels=True)

# compare the label counts
data.y.value_counts()
# count the labels
count_class_0,count_class_1 = data.y.value_counts()
print(count_class_0,count_class_1)

# split by label
df_class_0 = data[data['y'] == 0]
df_class_1 = data[data['y'] == 1]

# randomly oversample the minority class
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over],axis=0)

# data splitting
from sklearn.model_selection import StratifiedKFold
SK = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)
features_columns = [c for c in data.columns if c not in ['y','user_id']]
X = data[features_columns].values
y = data['y'].values
for k,(train_index,test_index) in enumerate(SK.split(X,y)):
    print('selected sample indices:')
    print(train_index,test_index)

    # slice the fold into train and test sets
    X_train = X[train_index]
    y_train = y[train_index]

    X_test = X[test_index]
    y_test = y[test_index]

    # number of samples per fold
    print('sample counts:')
    print(len(X_train))
    print(len(y_train))
    print(len(X_test))
    print(len(y_test))

Feature Engineering Project 3 (Dimensionality Reduction)

import pandas as pd
import numpy as np
# load the data
model_sample = pd.read_csv('/home/admin/jupyter/download/tianchi_learn_project/data/3/model_sample.csv')
# initial selection via correlation coefficients

# fill missing values with the column mean
columns = [c for c in model_sample.columns if c not in ['user_id','y']]
for col in columns:
    model_sample[col] = model_sample[col].fillna(np.mean(model_sample[col]))

# min-max normalization
def max_min(data, col, epsilon=1e-5, fillna=0):
    data_norm = data.copy()
    for col_i in col:
        data_norm[col_i] = (data[col_i] - data[col_i].min()) / (data[col_i].max() - data[col_i].min() + epsilon)
    data_norm = data_norm.fillna(fillna)
    return data_norm

data_norm = max_min(model_sample,columns)

# z-score normalization
def mean_std(data,col,epsilon=1e-5,fillna=0):
    data_norm = data.copy()
    for col_i in col:
        data_norm[col_i] = (data[col_i] - data[col_i].mean()) / (data[col_i].std() + epsilon)
    data_norm = data_norm.fillna(fillna)
    return data_norm
data_norm = mean_std(model_sample,columns)

# correlation helpers
def corr_(x,y):
    return np.abs(np.corrcoef(x,y)[0][1])

def corr_col(data_norm, col, y='y', percentile=0.25):
    col_corr = []
    for col_i in col:
        col_corr.append(corr_(data_norm[col_i],data_norm[y]))
    df_corr = pd.DataFrame({'col':col,'corr':col_corr}).dropna()
    col_corr_ = df_corr['corr']
    col_corr_percentile = np.percentile(col_corr_,percentile)
    return df_corr, col_corr, col_corr_percentile
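
A hedged usage sketch for the helpers above; note that np.percentile interprets its second argument on a 0-100 scale, so pass 25 (not 0.25) for the lower quartile:

df_corr, col_corr, threshold = corr_col(data_norm, columns, y='y', percentile=25)
selected = df_corr[df_corr['corr'] >= threshold]['col'].tolist()  # features above the threshold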

Machine Learning

sklearn

from sklearn import datasets
iris = datasets.load_iris()
digits = datasets.load_digits()

# handwritten digits
from sklearn import svm
clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(digits.data[:-1], digits.target[:-1])
clf.predict(digits.data[-1:])

import matplotlib.pyplot as plt
%matplotlib inline

plt.imshow(digits.images[-1], cmap=plt.cm.gray_r)

# iris
clf = svm.SVC()
X,y = iris.data, iris.target
clf.fit(X,y)
clf.predict(iris.data[-1:])

# model persistence
import pickle
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
clf2.predict(X[0:1])
from joblib import dump, load
dump(clf, 'filename.joblib')
clf = load('filename.joblib')

# one-vs-rest: extending a binary classifier to multiclass
from sklearn.multiclass import OneVsRestClassifier

X = [[1,2], [2,4], [4,5], [3,2], [3,1]]
y = [0,0,1,1,2]

classif = OneVsRestClassifier(estimator=svm.SVC(random_state=0))
classif.fit(X,y).predict(X)

# multi-label targets
from sklearn.preprocessing import MultiLabelBinarizer

y = [[0,1], [0,2], [1,3], [0,2,3], [2,4]]
y = MultiLabelBinarizer().fit_transform(y)
y
classif.fit(X,y).predict(X)

Case Study 1

import numpy as np
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data
y = iris.target

X[:10,:]  # first 10 rows, all columns

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=666)  # train/test split

from sklearn.preprocessing import StandardScaler  # standardization; exposes the fitted parameters

standardScaler = StandardScaler()

standardScaler.fit(X_train)
standardScaler.mean_
standardScaler.scale_

X_train_stand = standardScaler.transform(X_train)  # standardized training set
X_train_stand[:10,:]
X_test_stand = standardScaler.transform(X_test)    # standardized test set
X_test_stand[:10,:]

from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train_stand,y_train)  # fit on the standardized data

knn_clf.score(X_test_stand, y_test)  # score on the standardized test set

knn_clf.score(X_test, y_test)  # scoring the raw test set is inconsistent with the fit; shown for comparison

knn_clf.predict(X_test_stand[-1:,:])
# compare the prediction with the ground truth
y_test[-1]

# second case
digits = datasets.load_digits()

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.2, random_state=666)

sk_knn_clf = KNeighborsClassifier(n_neighbors=4,weights='uniform')
sk_knn_clf.fit(X_train,y_train)
sk_knn_clf.score(X_test,y_test)  # model accuracy

# grid search for the best hyperparameters
param_grid = [
    {
        # first parameter group
        'weights':['uniform'],
        'n_neighbors':[i for i in range(1,11)]
    },
    {
        # second parameter group
        'weights':['distance'],
        'n_neighbors':[i for i in range(1,11)]
    }
]
knn_clf = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(knn_clf, param_grid)
grid_search.fit(X_train,y_train)
knn_clf = grid_search.best_estimator_
grid_search.best_score_       # best cross-validation accuracy
grid_search.best_params_      # corresponding parameters
knn_clf.score(X_test,y_test)  # accuracy on the test set

Case Study 2

import pandas as pd
import numpy as np
df = pd.read_csv('/home/admin/jupyter/download/data/4/BostonHousing.csv')
df.head()

target = df['medv']
features = df.drop(columns=['medv'])  # everything except the target (missing in the original listing)

split_num = int(len(features) * 0.7)

X_train = features[:split_num]  # training set: first 70%
y_train = target[:split_num]

X_test = features[split_num:]   # test set: last 30%
y_test = target[split_num:]

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train,y_train)
model.coef_, model.intercept_

preds = model.predict(X_test)

# mean absolute error
def mae_value(y_true, y_pred):
    n = len(y_true)
    mae = sum(np.abs(y_true - y_pred)) / n
    return mae

# mean squared error
def mse_value(y_true, y_pred):
    n = len(y_true)
    mse = sum(np.square(y_true - y_pred)) / n
    return mse

mae = mae_value(y_test.values, preds)
mse = mse_value(y_test.values, preds)
print('MAE:', mae)
print('MSE:', mse)

Linear Regression

# 1. Introduction to linear regression
# 2. Implementing simple linear regression
# 3. Squared loss
# 4. Least squares and its algebraic solution
# 5. Implementing linear regression
# 6. Matrix derivation of least squares
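
As a worked illustration of item 6: the least-squares weights have the closed form w = (X^T X)^(-1) X^T y. A minimal numpy sketch on made-up data:

import numpy as np

x = np.array([1., 2., 3., 4., 5.])
y = np.array([2.1, 3.9, 6.2, 8.1, 9.8])

X = np.c_[np.ones(len(x)), x]         # design matrix with a bias column
w = np.linalg.inv(X.T @ X) @ X.T @ y  # normal equation
print(w)                              # [intercept, slope], roughly [0.14, 1.96]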

# sklearn approach (x and y as in the sketch above)
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

model = LinearRegression()
model.fit(x.reshape(len(x),1),y)
model.intercept_, model.coef_

y_temp = model.predict(x.reshape(len(x),1))

plt.scatter(x.tolist(),y.tolist())
plt.plot(x, y_temp, 'r')

# tip: deleting a row or column from a numpy array
import numpy as np
x = np.delete(x, 0, axis=0)  # for a 2-D array, axis=1 deletes a column instead

Polynomial Regression

# 1. Introduction to polynomial regression
# 2. Polynomial regression basics

# sklearn approach
import numpy as np
import matplotlib.pyplot as plt
x = [4, 8, 12, 25, 32, 43, 58, 63, 69, 79]
y = [20, 33, 50, 56, 42, 31, 33, 46, 65, 75]
from sklearn.preprocessing import PolynomialFeatures

x = np.array(x).reshape(len(x), 1)
y = np.array(y).reshape(len(y), 1)

poly_features = PolynomialFeatures(degree=2, include_bias=False)
poly_x = poly_features.fit_transform(x)

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(poly_x, y)
model.intercept_, model.coef_

x_temp = np.linspace(x.min(), x.max(), 300)  # dense grid for a smooth curve (undefined in the original listing)
x_temp = np.array(x_temp).reshape(len(x_temp), 1)
poly_x_temp = poly_features.fit_transform(x_temp)

plt.plot(x_temp, model.predict(poly_x_temp), 'r')
plt.scatter(x,y)

# polynomial regression in practice
import pandas as pd

df = pd.read_csv("/home/admin/jupyter/download/tianchi_learn_project/data/5/course-6-vaccine.csv")

# train/test split
train_df = df[:int(len(df)* 0.7)]
test_df = df[int(len(df)* 0.7):]

X_train = train_df['Year'].values
y_train = train_df['Values'].values

X_test = test_df['Year'].values
y_test = test_df['Values'].values

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train.reshape(len(X_train), 1),y_train.reshape(len(X_train), 1))

results = model.predict(X_test.reshape(len(X_test), 1))
results  # linear-regression predictions on the test set

plt.scatter(X_test,y_test)
plt.plot(X_test,results,'r')  # visualize

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

print('linear regression MAE:',mean_absolute_error(y_test, results.flatten()))
print('linear regression MSE:',mean_squared_error(y_test, results.flatten()))

# make_pipeline chains PolynomialFeatures + LinearRegression
from sklearn.pipeline import make_pipeline

X_train = X_train.reshape(len(X_train), 1)
X_test = X_test.reshape(len(X_test), 1)
y_train = y_train.reshape(len(y_train), 1)

for m in [3,4,5]:
    model = make_pipeline(PolynomialFeatures(m,include_bias=False), LinearRegression())
    model.fit(X_train, y_train)
    pre_y = model.predict(X_test)
    print('degree-{} polynomial MAE:'.format(m),mean_absolute_error(y_test, pre_y.flatten()))
    print('degree-{} polynomial MSE:'.format(m),mean_squared_error(y_test, pre_y.flatten()))

# MSE as a function of the polynomial degree

mse = []
m = 1
m_max = 10
while m <= m_max:
    model = make_pipeline(PolynomialFeatures(m,include_bias=False), LinearRegression())
    model.fit(X_train, y_train)
    pre_y = model.predict(X_test)

    mse.append(mean_squared_error(y_test,pre_y.flatten()))

    m += 1

plt.plot([i for i in range(1, m_max+1)], mse, 'r')
plt.scatter([i for i in range(1, m_max+1)], mse)
plt.title('MSE of m degree of polynomial regression')
plt.xlabel('m')
plt.ylabel('MSE')

k-Nearest Neighbors

# 1. Nearest-neighbor algorithm
# 2. k-nearest-neighbor algorithm
# 3. kNN workflow
# 4. Distance metrics
# 5. Decision rule
# 6. Implementing kNN

# kNN steps (a minimal from-scratch sketch follows)
# 1. prepare the data
# 2. compute distances
# 3. find the nearest neighbors
# 4. vote on the class
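
A minimal from-scratch kNN classifier following these four steps (Euclidean distance, majority vote), on made-up data:

import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x, k=3):
    dists = np.sqrt(np.sum((X_train - x) ** 2, axis=1))     # 2. distance to every training point
    nearest = np.argsort(dists)[:k]                         # 3. indices of the k nearest neighbors
    return Counter(y_train[nearest]).most_common(1)[0][0]   # 4. majority vote

# 1. made-up data: two clusters
X_train = np.array([[1,1],[1,2],[2,1],[8,8],[8,9],[9,8]])
y_train = np.array([0,0,0,1,1,1])
print(knn_predict(X_train, y_train, np.array([2,2])))  # 0
print(knn_predict(X_train, y_train, np.array([7,8])))  # 1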


# sklearn implementation
import pandas as pd

lilac_data = pd.read_csv('/home/admin/jupyter/download/tianchi_learn_project/data/6/course-9-syringa.csv')
from matplotlib import pyplot as plt
%matplotlib inline

'''scatter subplots of the lilac feature pairs'''
fig, axes = plt.subplots(2, 3, figsize=(20,10))
fig.subplots_adjust(hspace=0.3, wspace=0.2)

axes[0, 0].set_xlabel('sepal_length')
axes[0, 0].set_ylabel('sepal_width')
axes[0, 0].scatter(lilac_data.sepal_length[:50], lilac_data.sepal_width[:50], c='b')
axes[0, 0].scatter(lilac_data.sepal_length[50:100], lilac_data.sepal_width[50:100], c='g')
axes[0, 0].scatter(lilac_data.sepal_length[100:], lilac_data.sepal_width[100:], c='r')
axes[0, 0].legend(['daphne','syringa', 'willow'], loc=2)

axes[0, 1].set_xlabel('petal_length')
axes[0, 1].set_ylabel('petal_width')
axes[0, 1].scatter(lilac_data.petal_length[:50], lilac_data.petal_width[:50], c='b')
axes[0, 1].scatter(lilac_data.petal_length[50:100], lilac_data.petal_width[50:100], c='g')
axes[0, 1].scatter(lilac_data.petal_length[100:], lilac_data.petal_width[100:], c='r')

axes[0, 2].set_xlabel('sepal_length')
axes[0, 2].set_ylabel('petal_length')
axes[0, 2].scatter(lilac_data.sepal_length[:50], lilac_data.petal_length[:50], c='b')
axes[0, 2].scatter(lilac_data.sepal_length[50:100], lilac_data.petal_length[50:100], c='g')
axes[0, 2].scatter(lilac_data.sepal_length[100:], lilac_data.petal_length[100:], c='r')

axes[1, 0].set_xlabel('sepal_width')
axes[1, 0].set_ylabel('petal_width')
axes[1, 0].scatter(lilac_data.sepal_width[:50], lilac_data.petal_width[:50], c='b')
axes[1, 0].scatter(lilac_data.sepal_width[50:100], lilac_data.petal_width[50:100], c='g')
axes[1, 0].scatter(lilac_data.sepal_width[100:], lilac_data.petal_width[100:], c='r')

axes[1, 1].set_xlabel('sepal_length')
axes[1, 1].set_ylabel('petal_width')
axes[1, 1].scatter(lilac_data.sepal_length[:50], lilac_data.petal_width[:50], c='b')
axes[1, 1].scatter(lilac_data.sepal_length[50:100], lilac_data.petal_width[50:100], c='g')
axes[1, 1].scatter(lilac_data.sepal_length[100:], lilac_data.petal_width[100:], c='r')

axes[1, 2].set_xlabel('sepal_width')
axes[1, 2].set_ylabel('petal_length')
axes[1, 2].scatter(lilac_data.sepal_width[:50], lilac_data.petal_length[:50], c='b')
axes[1, 2].scatter(lilac_data.sepal_width[50:100], lilac_data.petal_length[50:100], c='g')
axes[1, 2].scatter(lilac_data.sepal_width[100:], lilac_data.petal_length[100:], c='r')

# split the dataset
from sklearn.model_selection import train_test_split

feature_data = lilac_data.iloc[:,:-1]
label_data = lilac_data['labels']

X_train,X_test,y_train,y_test = train_test_split(feature_data, label_data, test_size=0.3, random_state=2)

X_test.head()

from sklearn.neighbors import KNeighborsClassifier
import numpy as np

def sklearn_classify(train_data, label_data, test_data, k_num):
    knn = KNeighborsClassifier(n_neighbors=k_num)
    knn.fit(train_data, label_data)

    predict_label = knn.predict(test_data)

    return predict_label

y_predict = sklearn_classify(X_train, y_train, X_test, 3)
y_predict

# accuracy = correct predictions / total test samples
def get_accuracy(test_labels, pred_labels):
    correct = np.sum(test_labels == pred_labels)  # number of correct predictions

    n = len(test_labels)

    accur = correct / n
    return accur

get_accuracy(y_test,y_predict)

# accuracy as k varies
normal_accuracy = []
k_value = range(2, 11)
for k in k_value:
    y_predict = sklearn_classify(X_train, y_train, X_test, k)

    accuracy = get_accuracy(y_test,y_predict)

    normal_accuracy.append(accuracy)

plt.xlabel('k')
plt.ylabel('accuracy')
new_ticks = np.linspace(0.6, 0.9, 10)

plt.yticks(new_ticks)
plt.plot(k_value, normal_accuracy, c='r')
plt.grid(True)  # add a grid

# sklearn with a kd-tree (can be faster, especially in low dimensions)
kd_x = np.random.random((100000,2))
kd_y = np.random.randint(4, size=100000)

kd_knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree')
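
A rough timing sketch to check that claim, comparing brute force against the kd-tree on the data above (numbers vary by machine):

import time

for algo in ['brute', 'kd_tree']:
    clf = KNeighborsClassifier(n_neighbors=5, algorithm=algo)
    clf.fit(kd_x, kd_y)
    start = time.time()
    clf.predict(kd_x[:1000])          # query 1000 points
    print(algo, time.time() - start)  # kd_tree is usually faster in 2-D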

Support Vector Machines

# 1. Introduction to support vector machines
# 2. Linear SVM classification
# 3. Non-linear SVM classification

import numpy as np
from sklearn.datasets import make_blobs  # samples_generator was removed from newer sklearn; import the generators directly

import matplotlib.pyplot as plt
%matplotlib inline

# generate blob-shaped data
x, y = make_blobs(n_samples=60, centers=2, random_state=30, cluster_std=0.8)

plt.figure(figsize=(10, 8))
plt.scatter(x[:,0], x[:,1], c=y, s=40, cmap='bwr')

# an SVM looks for the most reasonable separating line
from sklearn.svm import SVC

linear_svc = SVC(kernel='linear')
linear_svc.fit(x, y)

# the support vectors
linear_svc.support_vectors_

def svc_plot(model):
    # current Axes object
    ax = plt.gca()
    x = np.linspace(ax.get_xlim()[0], ax.get_xlim()[1], 50)
    y = np.linspace(ax.get_ylim()[0], ax.get_ylim()[1], 50)

    # grid data: X holds the x-coordinate of every grid point, Y the y-coordinate
    Y,X = np.meshgrid(y, x)

    # stack into an array of (x, y) points
    xy = np.vstack([X.ravel(), Y.ravel()]).T

    # signed distance of each grid point to the separating hyperplane
    P = model.decision_function(xy).reshape(X.shape)

    # draw contour lines where the decision function equals -1, 0, 1 (like elevation contours)
    ax.contour(X,Y,P, colors='g', levels=[-1,0,1], linestyles=['--','-','--'])

    ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], c='g', s=100)

plt.figure(figsize=(10, 8))
plt.scatter(x[:,0], x[:,1], c=y, s=40, cmap='bwr')
svc_plot(linear_svc)

## non-linear SVM classification
from sklearn.datasets import make_circles
x2, y2 = make_circles(150, factor=.5, noise=.1, random_state=30)

plt.figure(figsize=(8, 8))
plt.scatter(x2[:, 0], x2[:, 1], c=y2, s=40, cmap='bwr')

# kernel function (a simple quadratic lift to a third dimension)
def kernel_function(xi, xj):
    poly = xi**2 + xj**2
    return poly

from mpl_toolkits import mplot3d
from ipywidgets import interact, fixed

r = kernel_function(x2[:,0], x2[:,1])
plt.figure(figsize=(10, 8))
ax = plt.subplot(projection='3d')
ax.scatter3D(x2[:, 0], x2[:, 1], r, c=y2, s=40, cmap='bwr')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('r')

rbf_svc = SVC(kernel='rbf', gamma='auto')
rbf_svc.fit(x2,y2)

plt.figure(figsize=(8, 8))
plt.scatter(x2[:,0], x2[:,1], c=y2, s=40, cmap='bwr')
svc_plot(rbf_svc)

## SVM project case
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets

iris = datasets.load_iris()

X = iris.data
y = iris.target

X = X[y<2, :2]
y = y[y<2]

plt.scatter(X[y==0, 0], X[y==0, 1], color='red')
plt.scatter(X[y==1, 0], X[y==1, 1], color='blue')

# standardize
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()
standardScaler.fit(X)

X_standard = standardScaler.transform(X)

# SVM model
from sklearn.svm import LinearSVC

svc = LinearSVC(C=1e9)
svc.fit(X_standard, y)

def plot_decision_boundary(model, axis):

    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
    )

    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    from matplotlib.colors import ListedColormap
    custom_map = ListedColormap(['#ef9a9a', '#fff59d', '#90caf9'])
    plt.contourf(x0, x1, zz, cmap=custom_map)

plot_decision_boundary(svc, axis=[-3, 3, -3, 3])
plt.scatter(X_standard[y==0, 0], X_standard[y==0, 1], color='red')
plt.scatter(X_standard[y==1, 0], X_standard[y==1, 1], color='blue')

Decision Trees

# 1. What is a decision tree
# 2. Decision tree algorithm flow
# 3. Information gain (ID3)
# 4. Handling continuous values
# 5. Implementing a decision tree
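
A worked illustration of information gain (item 3): gain = H(parent) minus the weighted entropy of the children after a split. A minimal numpy sketch on made-up binary data:

import numpy as np

def entropy(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))  # H = -sum(p * log2(p))

y = np.array([1,1,1,0,0,0,1,0])        # made-up labels
feature = np.array([1,1,1,1,0,0,0,0])  # made-up binary feature
left, right = y[feature == 0], y[feature == 1]
children_h = (len(left)*entropy(left) + len(right)*entropy(right)) / len(y)
print(entropy(y) - children_h)         # information gain of this split (about 0.19)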

import numpy as np
import pandas as pd

stu_grade = pd.read_csv('/home/admin/jupyter/download/data/10/students_academic_performance.csv')
stu_grade.head()

new_data = stu_grade.iloc[:, [0,1,2,3,4,5,6,8,9,10,11,14,15]]
new_data.head()

# bucket a grade into four levels
def choice_2(x):
    x = int(x)
    if x < 5:
        return 'bad'
    elif x >= 5 and x < 10:
        return 'medium'
    elif x >= 10 and x < 15:
        return 'good'
    else:
        return 'excellent'

stu_data = new_data.copy()
stu_data['G1'] = pd.Series(map(lambda x: choice_2(x), stu_data['G1']))
stu_data['G2'] = pd.Series(map(lambda x: choice_2(x), stu_data['G2']))
stu_data['G3'] = pd.Series(map(lambda x: choice_2(x), stu_data['G3']))

# bucket parental education into three levels (Pedu >= 3 counts as high)
def choice_3(x):
    x = int(x)
    if x >= 3:
        return 'high'
    elif x > 1.5:
        return 'medium'
    else:
        return 'low'

stu_data['Pedu'] = pd.Series(map(lambda x: choice_3(x), stu_data['Pedu']))
stu_data.head()

# encode each feature's values as integers
def replace_feature(data):
    for each in data.columns:
        feature_list = data[each]
        unique_value = set(feature_list)
        i = 0
        for fea_value in unique_value:
            data[each] = data[each].replace(fea_value, i)
            i += 1

    return data

stu_data = replace_feature(stu_data)
stu_data.head()

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(stu_data.iloc[:, :-1], stu_data['G3'], test_size=0.3, random_state=5)
X_test.head()

from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(criterion='entropy', random_state=666)
dt_model.fit(X_train, y_train)  # fit requires both features and labels

y_pred = dt_model.predict(X_test)
y_pred

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)