Now there is a natural language processing problem. I want to use content date to predict emotion(1=positive,-1=Negative,0=hard to say). I think this is a classification question. Here is part of training data:
Here is my code
import pandas as pd
import numpy as np
import jieba
from jieba import analyse
from sklearn.feature_extraction import DictVectorizer
tfidf = analyse.extract_tags
from collections import Counter
import jieba.posseg as psg
from sklearn import neighbors
data=pd.read_csv("train.csv",encoding="gbk")
data_for_subject=data['subject']
train_answer=data['sentiment_value']
data_for_value=data[['content','sentiment_value']]
list_for_value=data_for_value['content'].tolist()
def count(list1,list2):
c=Counter()
for word in list2:
c[word]+=1
c=dict(c)
for w in list(c):
if w not in list1:
del c[w]
return c
def remake(list_for_value):
#return a dict like{"keyword":"How many times did the keyword appear in the sentence","Part of speech":"How many times did the part of speech appear in the sentence"}
list1=[]
for i in list_for_value:
t=[]
keywords = tfidf(i,topK=10)
data=jieba.lcut(i)
c=count(keywords,data)
for x in psg.cut(i):
t.append(x.flag)
want_flag=["c", "v","n","a","p"]
d=count(want_flag,t)
c.update(d)
dict1={'num_keywords':len(t)}
c.update(dict1)
list1.append(c)
return list1
train_dict=remake(list_for_value)
vec = DictVectorizer()
train_array=vec.fit_transform(train_dict).toarray()
from sklearn.model_selection import train_test_split
features_train_v, features_test_v, lables_train_v, lables_test_v = train_test_split(train_array,train_answer,test_size=0.33,random_state=0)
knn=neighbors.KNeighborsClassifier(20,'distance')
knn.fit(features_train_v,lables_train_v)#no problem
knn.score(features_test_v,lables_test_v)#no problem
test=pd.read_csv("test_public.csv",encoding="gbk")
list_for_test=test['content'].tolist()
test_array=remake(list_for_test)
test_f=vec.fit_transform(test_array).toarray()
knn.predict(test_f)#error
ValueError: query data dimension must match training data dimension
What should I do to fix the mistakes and predict?
test_date
looks the same as train_date
, just some content.
Comments
Post a Comment