Table of Contents
- Computing Z (matrix form)
- 1.1 General parametric form
- 1.2 Simplified form
- Z
- 1.3 Matrix form
- 1.3.2 Z
- 2. Viterbi algorithm
- 3. Forward algorithm
- 4. Backward algorithm
- 5. Probability computation with forward-backward
- 6. Expectation computation
- 7. Parameter estimation (learning)
- 7.1 Gradient ascent
- References
- CRF is an undirected graphical model
- It is a discriminative model
- It models the dependency between each state and the whole observation sequence
https://www.unclewang.info/learn/machine-learning/756/
https://blog.csdn.net/u012421852/article/details/80287567
Computing Z (matrix form)
```python
import numpy as np

y0 = 1  # start
y4 = 1  # stop
```
The normalization factor Z over all paths from start to stop is simply the sum of the unnormalized probabilities of all those paths.
```python
class CCRF(object):
    """Matrix-form implementation of a conditional random field."""
    def __init__(self, M):
        self.M = M      # the CRF stored in matrix form
        self.Z = None   # normalization factor
        self.MP = []    # running matrix product
        self.work()

    def work(self):
        print('work......')
        self.MP = np.full(shape=np.shape(self.M[0]), fill_value=1.0)
        for i in range(np.shape(self.M)[0]):  # four matrices, four iterations
            print('\nML=\n', self.MP)
            print('M%d=\n' % i, self.M[i])
            self.MP = np.dot(self.MP, self.M[i])  # matrix multiplication
            print('dot=\n', self.MP)

    def ZValue(self):
        return self.MP[0, 0]


def CCRF_manual():
    M1 = np.array([[0.5, 0.5], [0, 0]])      # a01, a02 - A
    M2 = np.array([[0.3, 0.7], [0.7, 0.3]])  # b11, b12, b21, b22 - B
    M3 = np.array([[0.5, 0.5], [0.6, 0.4]])
    M4 = np.array([[1, 0], [1, 0]])
    M = np.array([M1, M2, M3, M4])
    print('CRF matrices:\n', M)
    crf = CCRF(M)
    ret = crf.ZValue()
    print('Normalization factor Z from start to stop:', ret)


if __name__ == '__main__':
    CCRF_manual()
```
```
CRF matrices:
 [[[0.5 0.5]
  [0.  0. ]]

 [[0.3 0.7]
  [0.7 0.3]]

 [[0.5 0.5]
  [0.6 0.4]]

 [[1.  0. ]
  [1.  0. ]]]
work......

ML=
 [[1. 1.]
 [1. 1.]]
M0=
 [[0.5 0.5]
 [0.  0. ]]
dot=
 [[0.5 0.5]
 [0.5 0.5]]

ML=
 [[0.5 0.5]
 [0.5 0.5]]
M1=
 [[0.3 0.7]
 [0.7 0.3]]
dot=
 [[0.5 0.5]
 [0.5 0.5]]

ML=
 [[0.5 0.5]
 [0.5 0.5]]
M2=
 [[0.5 0.5]
 [0.6 0.4]]
dot=
 [[0.55 0.45]
 [0.55 0.45]]

ML=
 [[0.55 0.45]
 [0.55 0.45]]
M3=
 [[1. 0.]
 [1. 0.]]
dot=
 [[1. 0.]
 [1. 0.]]
Normalization factor Z from start to stop: 1.0
```
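As a sanity check (a minimal sketch, not part of the original code), enumerating all 2³ label paths explicitly and summing their unnormalized probabilities reproduces the matrix-product result:

```python
import itertools
import numpy as np

M1 = np.array([[0.5, 0.5], [0, 0]])
M2 = np.array([[0.3, 0.7], [0.7, 0.3]])
M3 = np.array([[0.5, 0.5], [0.6, 0.4]])
M4 = np.array([[1, 0], [1, 0]])

# Sum the unnormalized probability of every start -> y1 -> y2 -> y3 -> stop path.
Z = sum(M1[0, y1] * M2[y1, y2] * M3[y2, y3] * M4[y3, 0]
        for y1, y2, y3 in itertools.product(range(2), repeat=3))
print(Z)  # 1.0, matching ZValue() above
```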
1.1 General parametric form
```python
import torch
import torch.nn as nn

sequence_len = 3
y_size = 2
k = 5  # number of transition features
l = 4  # number of emission (state) features

# Transition features: k feature functions, 2 transition positions
# (y1->y2, y2->y3) for a sequence of length 3, 2 states per y.
# Shape: (k, i, y_i, y_{i+1})
t = torch.tensor(
    [[[[0,1],[0,0]],[[0,1],[0,0]]],
     [[[1,0],[0,0]],[[0,0],[0,0]]],
     [[[0,0],[0,0]],[[0,0],[1,0]]],
     [[[0,0],[1,0]],[[0,0],[0,0]]],
     [[[0,0],[0,0]],[[0,0],[0,1]]]], dtype=float)
lamb = torch.tensor([1, 0.5, 1, 1, 0.2])

# Emission features: which (position, state) pairs each feature fires on.
# Shape: (l, position, state)
s = torch.tensor(
    [[[1,0],[0,0],[0,0]],
     [[0,1],[0,1],[0,0]],
     [[0,0],[1,0],[1,0]],
     [[0,0],[0,0],[0,1]]], dtype=float)
mu = torch.tensor([1, 0.5, 0.8, 0.5])

def P_y_x_condition(y):
    # Parametric form: exp of the weighted sum of transition and emission features.
    sumt = 0
    sums = 0
    for i in range(k):
        for j in range(len(y) - 1):
            sumt += lamb[i] * t[i, j, y[j], y[j+1]]
    for i in range(l):
        for j in range(len(y)):
            sums += mu[i] * s[i, j, y[j]]
    print(sums + sumt)
    return torch.exp(sums + sumt)  # unnormalized: Z is not divided out yet

y = [0, 1, 1]
print("p(y|x)=p(y1=1,y2=2,y3=3|x)=", P_y_x_condition(y))
```
```
tensor(3.2000, dtype=torch.float64)
p(y|x)=p(y1=1,y2=2,y3=3|x)= tensor(24.5325, dtype=torch.float64)
```

The printed 24.5325 = exp(3.2) exceeds 1 because this is the unnormalized score; the normalization factor Z is only divided out in section 1.3.
1.2 Simplified form
- A start state is introduced here too; the code below introduces both a start and a stop state (the start alone would suffice).
```python
# Compared with section 1.1, a start y0 and a stop y4 are added,
# so there are 4 transition positions: y0->y1, y1->y2, y2->y3, y3->y4.
# Unified feature tensor, shape (k+l, position, y_{i-1}, y_i):
f = torch.tensor(
    [[[[0,0],[0,0]],[[0,1],[0,0]],[[0,1],[0,0]],[[0,0],[0,0]]],
     [[[0,0],[0,0]],[[1,0],[0,0]],[[0,0],[0,0]],[[0,0],[0,0]]],
     [[[0,0],[0,0]],[[0,0],[0,0]],[[0,0],[1,0]],[[0,0],[0,0]]],
     [[[0,0],[0,0]],[[0,0],[1,0]],[[0,0],[0,0]],[[0,0],[0,0]]],
     [[[0,0],[0,0]],[[0,0],[0,0]],[[0,0],[0,1]],[[0,0],[0,0]]],
     [[[1,0],[1,0]],[[0,0],[0,0]],[[0,0],[0,0]],[[0,0],[0,0]]],
     [[[0,1],[0,1]],[[0,1],[0,1]],[[0,0],[0,0]],[[0,0],[0,0]]],
     [[[0,0],[0,0]],[[1,0],[1,0]],[[1,0],[1,0]],[[0,0],[0,0]]],
     [[[0,0],[0,0]],[[0,0],[0,0]],[[0,1],[0,1]],[[0,0],[0,0]]]], dtype=float)
w = torch.tensor([1, 0.5, 1, 1, 0.2, 1, 0.5, 0.8, 0.5])

def P_y_x_condition_with_f(y):
    # y includes the start and stop states, e.g. [0, 0, 1, 1, 0]
    total = 0
    for i in range(k + l):
        for j in range(len(y) - 1):
            total += w[i] * f[i, j, y[j], y[j+1]]
    print(total)
    return torch.exp(total)  # still unnormalized

p_y_x_con = P_y_x_condition_with_f([0, 0, 1, 1, 0])
print("p(y|x)=p(y1=1,y2=2,y3=3|x)=", p_y_x_con)
```
```
tensor(3.2000, dtype=torch.float64)
p(y|x)=p(y1=1,y2=2,y3=3|x)= tensor(24.5325, dtype=torch.float64)
```

The result matches section 1.1, as expected: the unified feature functions encode the same model.
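For reference, a vectorized equivalent of the double loop above (a sketch using advanced indexing; `y_full`, `idx` and `score` are illustrative names, not part of the original code):

```python
# Gather each feature function's value along the path, then weight and sum.
y_full = torch.tensor([0, 0, 1, 1, 0])   # start, y1, y2, y3, stop
idx = torch.arange(len(y_full) - 1)      # the 4 transition positions
score = (w * f[:, idx, y_full[:-1], y_full[1:]].sum(axis=1)).sum()
print(score, torch.exp(score))           # 3.2 and 24.5325, as above
```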
Z
1.3 Matrix form
a01 denotes the probability of going from start = y0 = 1 to y1 = 1,
and b21 denotes the probability of going from y1 = 2 to y2 = 1.
```python
w = torch.tensor([1, 0.5, 1, 1, 0.2, 1, 0.5, 0.8, 0.5])
# Caution: M = f aliases f, so the loop below rescales f in place;
# use f.clone() if the original feature tensor is needed later.
M = f
# print(M[0])
for i in range(k + l):
    M[i] = w[i] * f[i]
print(torch.sum(M, axis=0))
M = torch.exp(torch.sum(M, axis=0))
print("M(i,y_i-1,y_i):\n", M)
# Because y0 = start and y_{n+1} = stop are fixed,
# the impossible entries can be zeroed out:
# M[0,1,0] = M[0,1,1] = M[3,0,1] = M[3,1,1] = 0
# print("M(i,y_i-1,y_i):\n", M)  # this matches the figure above
```
```
tensor([[[1.0000, 0.5000],
         [1.0000, 0.5000]],

        [[1.3000, 1.5000],
         [1.8000, 0.5000]],

        [[0.8000, 1.5000],
         [1.8000, 0.7000]],

        [[0.0000, 0.0000],
         [0.0000, 0.0000]]], dtype=torch.float64)
M(i,y_i-1,y_i):
 tensor([[[2.7183, 1.6487],
         [2.7183, 1.6487]],

        [[3.6693, 4.4817],
         [6.0496, 1.6487]],

        [[2.2255, 4.4817],
         [6.0496, 2.0138]],

        [[1.0000, 1.0000],
         [1.0000, 1.0000]]], dtype=torch.float64)
```
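A hedged aside: the per-feature loop above can be written as one weighted sum over the feature axis, which also avoids rescaling f in place (`M_alt` is an illustrative name; it assumes the unscaled f from section 1.2):

```python
# Same M in one step: weight the feature tensors, sum over the feature axis, exponentiate.
# Assumes f is the unscaled tensor from section 1.2 (M = f above rescales it in place).
M_alt = torch.exp(torch.einsum('i,ijkl->jkl', w.double(), f))
```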
1.3.2 Z
$$
Z=\big(M_1(x)M_2(x)\cdots M_{n+1}(x)\big)_{start,\,stop}
$$
The unnormalized probabilities of the paths from start to stop corresponding to y=(1,1,1), y=(1,1,2), …, y=(2,2,2) are:

- a01b11c11, a01b11c12, a01b12c21, a01b12c22
- a02b21c11, a02b21c12, a02b22c21, a02b22c22

The normalization factor then follows from Eq. (11.12): computing the matrix product M1(x)M2(x)M3(x)M4(x), its first-row, first-column element is

- a01b11c11 + a01b11c12 + a01b12c21 + a01b12c22 + a02b21c11 + a02b21c12 + a02b22c21 + a02b22c22

which is exactly the sum of the unnormalized probabilities of all paths from start to stop, i.e., the normalization factor Z(x).
```python
def Z_M(M):
    z = M[0]
    for i in range(1, sequence_len + 1):
        z = torch.matmul(z, M[i])
    return z[0, 0]

print(Z_M(M))
```
```
tensor(253.9492, dtype=torch.float64)
```
```python
def P_y_x_condition_with_M(y):
    # y includes the start and stop states, e.g. [0, 0, 1, 1, 0]
    p = 1
    for i in range(len(y) - 1):
        p *= M[i, y[i], y[i+1]]
    print(p)
    return p / Z_M(M)

p_y_x_con = P_y_x_condition_with_M([0, 0, 1, 1, 0])
print("p(y|x)=p(y1=1,y2=2,y3=3|x)=", p_y_x_con)
```
```
tensor(24.5325, dtype=torch.float64)
p(y|x)=p(y1=1,y2=2,y3=3|x)= tensor(0.0966, dtype=torch.float64)
```

With Z divided out, this is now a proper probability: 24.5325 / 253.9492 ≈ 0.0966.
2. Viterbi algorithm
```python
print(torch.log(M))
```
```
tensor([[[1.0000, 0.5000],
         [1.0000, 0.5000]],

        [[1.3000, 1.5000],
         [1.8000, 0.5000]],

        [[0.8000, 1.5000],
         [1.8000, 0.7000]],

        [[0.0000, 0.0000],
         [0.0000, 0.0000]]], dtype=torch.float64)
```
```python
def Viterbi_M():
    delta = torch.zeros(sequence_len, y_size)
    logM = torch.log(M)
    delta[0] = logM[0, 0]  # scores of reaching each state from start
    indices = []
    for i in range(1, sequence_len):
        print(delta[i-1].reshape(y_size, 1) + logM[i])
        delta[i], indice = torch.max(delta[i-1].reshape(y_size, 1) + logM[i], axis=0)
        indices.append(indice)
    print(delta)
    # print(indices)
    # Backtrack from the best final state.
    path = torch.zeros(sequence_len, dtype=torch.int)
    path[sequence_len-1] = torch.argmax(delta[sequence_len-1])
    for i in range(sequence_len-2, -1, -1):
        path[i] = indices[i][path[i+1]]
    return path

Viterbi_M()
```
```
tensor([[2.3000, 2.5000],
        [2.3000, 1.0000]], dtype=torch.float64)
tensor([[3.1000, 3.8000],
        [4.3000, 3.2000]], dtype=torch.float64)
tensor([[1.0000, 0.5000],
        [2.3000, 2.5000],
        [4.3000, 3.8000]])
tensor([0, 1, 0], dtype=torch.int32)
```
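A brute-force check (a sketch, not in the original post): scoring all eight paths directly and taking the argmax agrees with the Viterbi result (the stop column of M[3] is all ones, so that factor can be dropped):

```python
import itertools

# Score each path start -> y1 -> y2 -> y3; M[3, y3, 0] = 1 for both states.
best = max(itertools.product(range(2), repeat=3),
           key=lambda p: M[0, 0, p[0]] * M[1, p[0], p[1]] * M[2, p[1], p[2]])
print(best)  # (0, 1, 0), matching Viterbi_M()
```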
3. Forward algorithm
- General form:

$$
\alpha_0(y_0|x)=\begin{cases}1 & y_0=\mathrm{start}=1\\ 0 & y_0\neq\mathrm{start}\end{cases}
$$

This is a single value:

$$
\alpha_{i+1}(y_{i+1}|x)=\alpha_i(y_i|x)\,M_{i+1}(y_i,y_{i+1}|x)
$$
- Matrix form:

This is a vector:

$$
\alpha_i(x)=\big(\alpha_i(y_i=1|x),\ \alpha_i(y_i=2|x),\ \ldots,\ \alpha_i(y_i=m|x)\big)^{T},\qquad
\alpha_{i+1}^{T}(x)=\alpha_i^{T}(x)\,M_{i+1}(x)
$$

with $M_1=M[0],\ M_2=M[1],\ M_3=M[2],\ M_4=M[3]$.
```python
# Zero out the impossible boundary transitions (y0 = start, y_{n+1} = stop).
M[0,1,0] = M[0,1,1] = M[3,0,1] = M[3,1,1] = 0

def alpha():
    alpha = torch.zeros(sequence_len + 2, y_size, dtype=float)
    alpha[0, 0] = 1  # alpha_0 is 1 at the start state
    for i in range(sequence_len + 1):
        alpha[i+1] = torch.matmul(alpha[i].reshape(1, y_size), M[i])
    print(alpha)
    return alpha

alpha = alpha()
```
```
tensor([[  1.0000,   0.0000],
        [  2.7183,   1.6487],
        [ 19.9484,  14.9008],
        [134.5403, 119.4088],
        [253.9492,   0.0000]], dtype=torch.float64)
```
4. Backward algorithm
$$
\beta_i(y_i|x)=M_{i+1}(y_i,y_{i+1}|x)\,\beta_{i+1}(y_{i+1}|x)
$$

$$
\beta_{n+1}(y_{n+1}|x)=\begin{cases}1 & y_{n+1}=\mathrm{stop}=1\\ 0 & y_{n+1}\neq\mathrm{stop}\end{cases}
$$

Vector form:

$$
\beta_i(x)=M_{i+1}(x)\,\beta_{i+1}(x),\qquad
Z(x)=\alpha_n^{T}(x)\cdot\mathbf{1}=\mathbf{1}^{T}\cdot\beta_1(x)
$$
```python
def beta():
    beta = torch.zeros(sequence_len + 2, y_size, dtype=float)
    beta[sequence_len + 1, 0] = 1  # beta_{n+1} is 1 at the stop state
    for i in range(sequence_len, -1, -1):
        # print(M[i], beta[i+1].reshape(y_size, 1))
        beta[i] = torch.matmul(M[i], beta[i+1].reshape(y_size, 1)).reshape(y_size)
    print(beta)
    return beta

beta = beta()
```
```
tensor([[253.9492,   0.0000],
        [ 60.7485,  53.8707],
        [  6.7072,   8.0634],
        [  1.0000,   1.0000],
        [  1.0000,   0.0000]], dtype=torch.float64)
```
```python
def Z_alpha(alpha):
    return torch.sum(alpha[sequence_len + 1])

print(Z_alpha(alpha))
```
```
tensor(253.9492, dtype=torch.float64)
```
```python
def Z_beta(beta):
    # print(beta)
    return torch.sum(beta[0])

print(Z_beta(beta))
```
```
tensor(253.9492, dtype=torch.float64)
```
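A quick consistency check (sketch): at every position i, αᵢᵀ·βᵢ sums the unnormalized probability of all paths passing through that position, so it must equal Z(x):

```python
# alpha_i . beta_i accumulates every path through position i, i.e. Z(x), for all i.
for i in range(sequence_len + 2):
    print(torch.sum(alpha[i] * beta[i]))  # each ~ 253.9492
```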
5. Probability computation with forward-backward
This is a single value:

$$
p(y_i|x)=\frac{\alpha_i^{T}(y_i|x)\,\beta_i(y_i|x)}{Z(x)},\qquad
p(y_{i-1},y_i|x)=\frac{\alpha_{i-1}^{T}(y_{i-1}|x)\,M_i(y_{i-1},y_i|x)\,\beta_i(y_i|x)}{Z(x)}
$$
Derivation:
$$
\begin{aligned}
p(y_t=i|x)&=\sum_{y_1,\ldots,y_{t-1},\,y_{t+1},\ldots,y_T}p(y|x)\\
&=\sum_{y_1,\ldots,y_{t-1}}\sum_{y_{t+1},\ldots,y_T}\frac{1}{Z}\prod_{t'=1}^{T}\Phi_{t'}(y_{t'-1},y_{t'},x)\\
&=\frac{1}{Z}\sum_{y_1,\ldots,y_{t-1}}\prod_{t'=1}^{t}\Phi_{t'}(y_{t'-1},y_{t'},x)\;\sum_{y_{t+1},\ldots,y_T}\prod_{t'=t+1}^{T}\Phi_{t'}(y_{t'-1},y_{t'},x)\\
&=\frac{1}{Z}\,\Delta_{\mathrm{left}}\,\Delta_{\mathrm{right}}\\
\alpha_t(i)=\Delta_{\mathrm{left}}&=\sum_{y_1,\ldots,y_{t-1}}\prod_{t'=1}^{t}\Phi_{t'}(y_{t'-1},y_{t'},x)\\
&=\sum_{y_1,\ldots,y_{t-1}}\Phi_1(y_0,y_1,x)\,\Phi_2(y_1,y_2,x)\cdots\Phi_{t-1}(y_{t-2},y_{t-1},x)\,\Phi_t(y_{t-1},y_t=i,x)\\
&=\sum_{y_{t-1}}\Phi_t(y_{t-1},y_t,x)\sum_{y_{t-2}}\Phi_{t-1}(y_{t-2},y_{t-1},x)\cdots\sum_{y_1}\Phi_2(y_1,y_2,x)\,\Phi_1(y_0,y_1,x)\\
\beta_t(i)=\Delta_{\mathrm{right}}&=\sum_{y_{t+1},\ldots,y_T}\prod_{t'=t+1}^{T}\Phi_{t'}(y_{t'-1},y_{t'},x)
\end{aligned}
$$
```python
def p_y_x_condition_alpha_beta(alpha, beta):  # p(y_i|x)
    p_y_x = alpha * beta / Z_alpha(alpha)
    # print(alpha[2].reshape(1, y_size) * beta[2].reshape(y_size, 1))
    return p_y_x

y = [0, 1, 1]
p_y_x_condition_alpha_beta(alpha, beta)
```
```
tensor([[1.0000, 0.0000],
        [0.6503, 0.3497],
        [0.5269, 0.4731],
        [0.5298, 0.4702],
        [1.0000, 0.0000]], dtype=torch.float64)
```

Each row sums to 1, as a marginal distribution should.
```python
def p_y12_x_condition_alpha_beta(alpha, beta):  # p(y_{i-1}, y_i | x)
    p = M.clone().detach()
    for i in range(sequence_len + 1):
        p[i] = alpha[i].reshape(y_size, 1) * p[i] * beta[i+1]
    return p / Z_alpha(alpha)

p_y12_x_condition_alpha_beta(alpha, beta)
```
```
tensor([[[0.6503, 0.3497],
         [0.0000, 0.0000]],

        [[0.2634, 0.3868],
         [0.2634, 0.0863]],

        [[0.1748, 0.3520],
         [0.3550, 0.1182]],

        [[0.5298, 0.0000],
         [0.4702, 0.0000]]], dtype=torch.float64)
```
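Marginalizing the pairwise distribution over y_{i-1} should recover the unary marginals above; a short check (`p12` is an illustrative name):

```python
p12 = p_y12_x_condition_alpha_beta(alpha, beta)
# Summing out y_1 from p(y_1, y_2 | x) gives p(y_2 | x).
print(torch.sum(p12[1], axis=0))  # ~ [0.5269, 0.4731], matching p(y_2|x) above
```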
6. Expectation computation
```python
def E_fk_py_x(k, alpha, beta):  # E_{p(y|x)}[f_k]
    return torch.sum(f[k] * p_y12_x_condition_alpha_beta(alpha, beta))

E_fk_py_x(1, alpha, beta)
```
```
tensor(0.1317, dtype=torch.float64)
```

Caution: because `M = f` in section 1.3 rescaled f in place, f[1] here already carries the factor w[1] = 0.5, so this prints w₁·E[f₁] rather than E[f₁] = 0.2634.
7. Parameter estimation (learning)
$$
\hat{\theta}=\arg\max\prod_{i=1}^{N}p(y^{(i)}|x^{(i)}),\qquad
\hat{\lambda},\hat{\eta}=\arg\max_{\lambda,\eta}\prod_{i=1}^{N}p(y^{(i)}|x^{(i)})
$$

$$
L=\sum_{i=1}^{N}\log p(y^{(i)}|x^{(i)})
 =\sum_{i=1}^{N}\Big(-\log Z+\sum_{t=1}^{T}\big(\lambda^{T}f(y_{t-1},y_t,x)+\eta^{T}g(y_t,x)\big)\Big)
$$

$$
\frac{\partial L}{\partial\lambda}
 =\sum_{i=1}^{N}\Big(-\frac{\partial}{\partial\lambda}\log Z+\sum_{t=1}^{T}f(y_{t-1},y_t,x)\Big)
$$

For the log-partition function, the gradient is an expectation (summing against $P(y|x)$ is exactly taking $E[\cdot]$):

$$
\begin{aligned}
\frac{\partial}{\partial\lambda}\log Z
&=E\Big(\sum_{t=1}^{T}f(y_{t-1},y_t,x^{(i)})\Big)
 =\sum_{y}P(y|x^{(i)})\sum_{t=1}^{T}f(y_{t-1},y_t,x^{(i)})\\
&=\sum_{t=1}^{T}\sum_{y}P(y|x^{(i)})\,f(y_{t-1},y_t,x^{(i)})\\
&=\sum_{t=1}^{T}\sum_{y_{t-1}}\sum_{y_t}\Big(\sum_{y_1,\ldots,y_{t-2}}\sum_{y_{t+1},\ldots,y_T}P(y|x^{(i)})\Big)\,f(y_{t-1},y_t,x^{(i)})\\
&=\sum_{t=1}^{T}\sum_{y_{t-1}}\sum_{y_t}P(y_{t-1},y_t|x^{(i)})\,f(y_{t-1},y_t,x^{(i)})
\end{aligned}
$$
$$
p(y_{i-1},y_i|x)=\frac{\alpha_{i-1}^{T}(y_{i-1}|x)\,M_i(y_{i-1},y_i|x)\,\beta_i(y_i|x)}{Z(x)}
$$
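The identity ∂logZ/∂λ = E[f] can be spot-checked on the toy example with autograd; a minimal sketch, assuming the unscaled f from section 1.2 is in scope (`w_chk` and `logZ` are illustrative names):

```python
import itertools

w_chk = torch.tensor([1, 0.5, 1, 1, 0.2, 1, 0.5, 0.8, 0.5],
                     dtype=torch.double, requires_grad=True)
scores = []
for p in itertools.product(range(2), repeat=3):
    full = [0] + list(p) + [0]  # add start and stop
    scores.append(sum((w_chk * f[:, i, full[i], full[i+1]]).sum()
                      for i in range(len(full) - 1)))
logZ = torch.logsumexp(torch.stack(scores), dim=0)
logZ.backward()
print(w_chk.grad)  # expected feature counts E_{p(y|x)}[f_k] for every k
```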
7.1 Gradient ascent
$$
\lambda^{t+1}=\lambda^{t}+step\cdot\frac{\partial L}{\partial\lambda},\qquad
\eta^{t+1}=\eta^{t}+step\cdot\frac{\partial L}{\partial\eta}
$$
```python
def delta_log_L(self, alpha, beta, y):
    # Gradient of the log-likelihood for one sequence:
    # observed feature counts minus expected feature counts
    # (0 and 9 appear to be the start/stop label indices in the full implementation).
    # print(self.f[:,3,[0,0,1,1],[0,1,1,0]])  # y=[0,1,1]
    delta = torch.sum(self.f[:, len(y), [0] + y, y + [9]], axis=(1)) \
          - torch.sum(self.f * self.p_y12_x_condition_alpha_beta(alpha, beta),
                      axis=(1, 2, 3))
    return delta
```
```python
def predict(self, x):
    self.sequence_len = len(x)
    self.get_ts(x)
    self.M = self.f2M()
    return self.Viterbi_M()

def train(self, traindata):
    batch_size = 100
    num_batch = int(len(traindata[0]) / batch_size)
    for e in range(num_batch):
        delta = 0
        for i in range(batch_size):
            x = traindata[0][e*batch_size + i]
            y = traindata[1][e*batch_size + i]
            self.sequence_len = len(x)
            # print(x)
            self.get_ts(x)
            self.M = self.f2M()
            alpha = self.alpha()
            beta = self.beta()
            delta += self.delta_log_L(alpha, beta, y)
        print(delta)
        print(self.Viterbi_M())
        print(y)
        self.w = self.w + 0.0001 * delta
```
- In practice, gradient ascent converges very slowly.
- Alternatives (a hedged L-BFGS sketch follows this list):
  - conjugate gradient methods
  - limited-memory quasi-Newton methods (e.g. L-BFGS)
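As a taste of the quasi-Newton route, here is a minimal hedged sketch that fits the toy example's weights with torch.optim.LBFGS, differentiating the exact (brute-force enumerated) log-likelihood with autograd instead of the hand-coded gradient. It assumes the unscaled f tensor from section 1.2; `w_fit`, `path_score` and `neg_log_likelihood` are illustrative names, not part of the original implementation:

```python
import itertools
import torch

w_fit = torch.zeros(9, dtype=torch.double, requires_grad=True)
y_obs = [0, 1, 1]  # the observed label sequence

def path_score(path, weights):
    full = [0] + list(path) + [0]  # add start and stop states
    return sum((weights * f[:, i, full[i], full[i+1]]).sum()
               for i in range(len(full) - 1))

def neg_log_likelihood(weights):
    # Exact -log p(y_obs|x): logsumexp over all 8 paths minus the observed score.
    scores = torch.stack([path_score(p, weights)
                          for p in itertools.product(range(2), repeat=3)])
    return torch.logsumexp(scores, dim=0) - path_score(y_obs, weights)

opt = torch.optim.LBFGS([w_fit], lr=0.5, max_iter=50)

def closure():
    opt.zero_grad()
    loss = neg_log_likelihood(w_fit)
    loss.backward()
    return loss

opt.step(closure)
print('p(y_obs|x) after fitting:', torch.exp(-neg_log_likelihood(w_fit)).item())
```

For a real dataset the enumeration would be replaced by the forward algorithm's Z(x); the optimizer interface stays the same.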
The current implementation is still painfully slow... to be improved later.
References
- PRML course, UCAS (国科大)
- NLP course, UCAS (国科大)
- 条件随机场CRF(一) 从随机场到线性链条件随机场 (blog post: from random fields to linear-chain CRFs)
- 《统计学习方法》 (Statistical Learning Methods), Li Hang
- "白板推导" (whiteboard derivation) video series on CRFs
- A CRF implementation (its feature functions are reused here)