用户评论情感极性判别

作者 Lei da 日期 2018-07-09
用户评论情感极性判别

本文介绍百度点石平台上一个训练赛的赛题代码。该赛题是对用户评论文本进行情感极性判别的分类问题,赛题链接戳此处。

数据预处理

使用测试数据和训练数据生成语料库

import numpy as np
import jieba
import codecs
def load_data(file_path, encoding=None):
    """Read a tab-separated text file into a list of rows.

    Every line is stripped of surrounding whitespace and split on tab
    characters. No header handling or type conversion is performed, and
    rows may have different numbers of fields.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one entry per line; each entry is the list of string
        fields found on that line.
    """
    data_set = []
    with open(file_path, 'r', encoding=encoding) as lines:
        for line in lines:
            data_set.append(line.strip().split("\t"))
    # The rows are returned as plain lists on purpose: they can be ragged,
    # and building a NumPy array from ragged rows fails on recent NumPy.
    return data_set


# Build the word2vec corpus from both the training and the test reviews.
dataAll = load_data('data_train.csv')
dataTest = load_data('data_test.csv')
# The `with` block guarantees the corpus is flushed and closed before any
# later step reads it back.
with codecs.open("fenci_result.csv", 'w', 'utf-8') as csvfile:
    for item in dataAll:
        seg_list = jieba.cut(item[2])  # segment the review text with jieba
        # Tokens are space-separated; the trailing newline keeps the last
        # token of this review from fusing with the first token of the next.
        csvfile.write(" ".join(seg_list) + "\n")

    for item in dataTest:
        seg_list = jieba.cut(item[-1])  # test rows: text is the last field
        csvfile.write(" ".join(seg_list) + "\n")

利用上一步生成的语料库,使用 word2vec 工具训练词向量模型并保存,后续用于将句子转化为向量

from gensim.models import word2vec
import logging
# Emit gensim's training progress at INFO level, prefixed with a timestamp.
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)
sentences = word2vec.Text8Corpus("fenci_result.csv") # load the segmented corpus file
# Train 400-dimensional word vectors on the corpus.
# NOTE(review): no `sg` argument is passed, so gensim's default training
# algorithm is used. The original comment called this a skip-gram model;
# gensim documents CBOW as the default -- confirm which one is intended.
# NOTE(review): `size` is presumably the gensim < 4.0 spelling of
# `vector_size` -- confirm against the installed gensim version.
model = word2vec.Word2Vec(sentences, size = 400)

# Save the model so that it can be reused: once in gensim's native format
# and once as binary word2vec-format vectors.
model.save("corpus.model")
model.wv.save_word2vec_format("corpus.model.bin", binary = True)

数据训练与测试

感觉训练方式很简陋,有待改善

#本程序用来测试模型
#coding=utf-8
import re
import numpy as np
import jieba
from gensim.models import word2vec
import logging
import codecs
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score,confusion_matrix, f1_score, precision_score, recall_score, \
roc_curve # 导入指标库
import prettytable # 导入表格库
def load_data(file_path, encoding=None):
    """Read a tab-separated text file into a list of rows.

    Every line is stripped of surrounding whitespace and split on tab
    characters. No header handling or type conversion is performed, and
    rows may have different numbers of fields.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one entry per line; each entry is the list of string
        fields found on that line.
    """
    data_set = []
    with open(file_path, 'r', encoding=encoding) as lines:
        for line in lines:
            data_set.append(line.strip().split("\t"))
    # The rows are returned as plain lists on purpose: they can be ragged,
    # and building a NumPy array from ragged rows fails on recent NumPy.
    return data_set
def write_result(array, outpuFilePath):
    """Write predictions to a file as ``index,label`` lines.

    Args:
        array: Sequence of integer-like labels, in output order.
        outpuFilePath: Path of the file to create or overwrite.

    The index column is 1-based and follows the order of ``array``.
    """
    with open(outpuFilePath, 'w') as output_file:
        for row_id, label in enumerate(array, start=1):
            output_file.write("%d,%d\n" % (row_id, label))
def getWordVecs(wordList):
    """Collect the embedding of every known word in ``wordList``.

    Each word is stripped of surrounding whitespace and looked up in the
    module-level ``model``; words for which the lookup raises ``KeyError``
    are skipped.

    Args:
        wordList: Iterable of word strings.

    Returns:
        A float NumPy array built from the vectors that were found (empty
        when no word was found).
    """
    found = []
    for raw_word in wordList:
        token = raw_word.strip()
        try:
            vector = model[token]
        except KeyError:
            # Word is not in the embedding vocabulary: ignore it.
            continue
        found.append(vector)
    return np.array(found, dtype='float')





# Load the word vectors saved by the word2vec training step.
model = word2vec.KeyedVectors.load_word2vec_format("corpus.model.bin", binary = True)

dataAll = load_data('data_train.csv')
X = []
y = []
# Only the first 1500 rows are used for this experiment. The rows stay a
# plain list because they may have different numbers of fields.
dataAll = dataAll[:1500]
for item in dataAll:
    segList = jieba.cut(item[2])
    vecList = getWordVecs(segList)
    if len(vecList) == 0:
        # No word of this review has an embedding: skip the sample AND its
        # label, otherwise X and y end up with different lengths.
        continue
    # Sentence vector = average of its word vectors.
    X.append(vecList.mean(axis=0))
    y.append(int(item[-1]))  # the label is the last field of a training row
x_train = np.array(X)
y_train = np.array(y)
print(x_train)
print(y_train)

# x_train = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2], [2, 1], [3, 2]])
# print(x_train)
# Reduce dimensionality with scikit-learn's PCA, keeping as many components
# as needed to explain 95% of the variance.
model_pca = PCA(n_components=0.95)
# fit_transform fits the model and projects the data in one pass; a separate
# fit() call beforehand would only repeat the same work.
newX = model_pca.fit_transform(x_train)
components = model_pca.components_  # retained principal axes, one per row
components_var = model_pca.explained_variance_  # variance of each component
components_var_ratio = model_pca.explained_variance_ratio_  # share of total variance per component
print("\n主成分分析:")
print(components)  # all retained principal axes
print(len(components_var))  # number of retained components
print(components_var_ratio)  # variance ratio of every retained component
print(len(newX))  # number of samples
print(len(newX[0]))  # number of features after the reduction


# Hold out 30% of the PCA-reduced samples for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    newX, y_train, test_size=0.3, random_state=0
)
# Linear SVM with a one-vs-rest decision function.
clf = svm.SVC(C=1, kernel='linear', decision_function_shape='ovr')
clf.fit(X_train, y_train)

y_hat = clf.predict(X_test)

# Evaluation metrics. Precision and recall are macro-averaged while the F1
# score is weighted by class support.
accuracy_s = accuracy_score(y_test, y_hat)
precision_s = precision_score(y_test, y_hat, average='macro')
recall_s = recall_score(y_test, y_hat, average='macro')
f1_s = f1_score(y_test, y_hat, average='weighted')
for title, score in (
    ('Accuracy:', accuracy_s),
    ('Precision:', precision_s),
    ('Recall:', recall_s),
    ('f-measure:', f1_s),
):
    print(title)
    print(score)

# Confusion matrix of the held-out predictions.
confusion_m = confusion_matrix(y_test, y_hat)
confusion_matrix_table = prettytable.PrettyTable()
# Add one table row per matrix row instead of hard-coding rows 0..2, so the
# table is correct for any number of classes present in the split.
for matrix_row in confusion_m:
    confusion_matrix_table.add_row(matrix_row)
print('confusion matrix')
print(confusion_matrix_table)

# Dump the held-out predictions for inspection.
write_result(y_hat, 'print.csv')

预测阶段

使用所有训练数据训练模型并对test数据进行预测

#本程序用来进行预测
#coding=utf-8
import re
import numpy as np
import jieba
from gensim.models import word2vec
import logging
import codecs
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import svm
def load_data(file_path, encoding=None):
    """Read a tab-separated text file into a list of rows.

    Every line is stripped of surrounding whitespace and split on tab
    characters. No header handling or type conversion is performed, and
    rows may have different numbers of fields.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one entry per line; each entry is the list of string
        fields found on that line.
    """
    data_set = []
    with open(file_path, 'r', encoding=encoding) as lines:
        for line in lines:
            data_set.append(line.strip().split('\t'))
    # The rows are returned as plain lists on purpose: they can be ragged,
    # and building a NumPy array from ragged rows fails on recent NumPy.
    return data_set
def write_result(array, outpuFilePath):
    """Write predictions to a file as ``index,label`` lines.

    Args:
        array: Sequence of integer-like labels, in output order.
        outpuFilePath: Path of the file to create or overwrite.

    The index column is 1-based and follows the order of ``array``.
    """
    with open(outpuFilePath, 'w') as output_file:
        for row_id, label in enumerate(array, start=1):
            output_file.write("%d,%d\n" % (row_id, label))

def getWordVecs(wordList):
    """Collect the embedding of every known word in ``wordList``.

    Each word is stripped of surrounding whitespace and looked up in the
    module-level ``model``; words for which the lookup raises ``KeyError``
    are skipped.

    Args:
        wordList: Iterable of word strings.

    Returns:
        A float NumPy array built from the vectors that were found (empty
        when no word was found).
    """
    found = []
    for raw_word in wordList:
        token = raw_word.strip()
        try:
            vector = model[token]
        except KeyError:
            # Word is not in the embedding vocabulary: ignore it.
            continue
        found.append(vector)
    return np.array(found, dtype='float')
def preDataHandle(pca=None):
    """Turn the reviews in ``data_test.csv`` into PCA-reduced feature vectors.

    Args:
        pca: A PCA object that has already been fitted on the training
            vectors. Defaults to the module-level ``model_pca``.

    Returns:
        A 2-D array with exactly one row per line of ``data_test.csv``, in
        file order, projected with ``pca``.

    The projection must be the one learned from the training data: fitting
    a separate PCA on the test set would place the test vectors in a
    different basis from the one the classifier was trained on.
    """
    if pca is None:
        pca = model_pca  # PCA fitted on the training data at module level
    preData = load_data('data_test.csv')
    xPre = []
    for item in preData:
        # Everything from the third field onwards is treated as review text
        # (presumably the text itself can contain tabs -- confirm).
        s = ''.join(item[2:])
        vecList = getWordVecs(jieba.cut(s))
        if len(vecList) != 0:
            # Sentence vector = average of its word vectors.
            xPre.append(vecList.mean(axis=0))
        else:
            print('存在vecList长度为0的情况')
            print(item)
            # Keep a placeholder row so output ids stay aligned with the
            # test file; the training mean projects onto the PCA origin.
            xPre.append(pca.mean_.copy())
    x_pre = np.array(xPre)
    return pca.transform(x_pre)



# Load the word vectors saved by the word2vec training step.
model = word2vec.KeyedVectors.load_word2vec_format("corpus.model.bin", binary = True)
dataAll = load_data('data_train.csv')
X = []
y = []
for item in dataAll:
    print(item)
    segList = jieba.cut(item[2])
    vecList = getWordVecs(segList)
    if len(vecList) == 0:
        # No word of this review has an embedding: report the row and skip
        # both the sample and its label so X and y keep the same length.
        print(item)
        continue
    # Sentence vector = average of its word vectors.
    X.append(vecList.mean(axis=0))
    y.append(int(item[-1]))  # the label is the last field of a training row
x_train = np.array(X)
y_train = np.array(y)
# Reduce dimensionality with PCA, keeping as many components as needed to
# explain 95% of the variance.
model_pca = PCA(n_components=0.95)
# fit_transform fits the model and projects the data in one pass; a separate
# fit() call beforehand would only repeat the same work.
newX = model_pca.fit_transform(x_train)
factorNum = len(newX[0])  # number of components PCA retained
# Linear SVM with a one-vs-rest decision function, trained on all samples.
clf = svm.SVC(C=1, kernel='linear', decision_function_shape='ovr')
clf.fit(newX, y_train)

# Vectorize the test set, predict and write the submission file.
x_pre = preDataHandle()
y_pre = clf.predict(x_pre)

write_result(y_pre, 'output.csv')
print('Project has been finished successfully!')

比赛平台上计算出的结果 F1-score 为 0.7249,分数偏低,仍有较大的改进空间,希望再接再厉