Donate. I desperately need donations to survive due to my health

Get paid by answering surveys Click here

Click here to donate

Remote/Work from Home jobs

sklearn-i hava been build models. How can I predict?

Now there is a natural language processing problem. I want to use content date to predict emotion(1=positive,-1=Negative,0=hard to say). I think this is a classification question. Here is part of training data:

enter image description here

Here is my code

import pandas as pd
import numpy as np
import jieba
from jieba import analyse
from sklearn.feature_extraction import DictVectorizer
tfidf = analyse.extract_tags
from collections import Counter
import jieba.posseg as psg
from sklearn import neighbors
data=pd.read_csv("train.csv",encoding="gbk")
data_for_subject=data['subject']
train_answer=data['sentiment_value']
data_for_value=data[['content','sentiment_value']]
list_for_value=data_for_value['content'].tolist()
def count(list1,list2):

    c=Counter()
    for word in list2:
        c[word]+=1
    c=dict(c)
    for w in list(c):
        if w not in list1:
            del c[w]
    return c

def remake(list_for_value):
#return a dict like{"keyword":"How many times did the keyword appear in the sentence","Part of speech":"How many times did the part of speech appear in the sentence"}

    list1=[]
    for i in list_for_value:
        t=[]
        keywords = tfidf(i,topK=10)
        data=jieba.lcut(i)
        c=count(keywords,data)
        for x in psg.cut(i):
            t.append(x.flag)
        want_flag=["c", "v","n","a","p"]
        d=count(want_flag,t)
        c.update(d)
        dict1={'num_keywords':len(t)}
        c.update(dict1)
        list1.append(c)
    return list1

train_dict=remake(list_for_value)
vec = DictVectorizer()
train_array=vec.fit_transform(train_dict).toarray()
from sklearn.model_selection import train_test_split
features_train_v, features_test_v, lables_train_v, lables_test_v = train_test_split(train_array,train_answer,test_size=0.33,random_state=0)
knn=neighbors.KNeighborsClassifier(20,'distance')
knn.fit(features_train_v,lables_train_v)#no problem
knn.score(features_test_v,lables_test_v)#no problem
test=pd.read_csv("test_public.csv",encoding="gbk")
list_for_test=test['content'].tolist()
test_array=remake(list_for_test)
test_f=vec.fit_transform(test_array).toarray()
knn.predict(test_f)#error

ValueError: query data dimension must match training data dimension

What should I do to fix the mistakes and predict?

test_date looks the same as train_date, just some content.

Comments