import os, sys
import nltk
import numpy as np
import matplotlib.pyplot as plt

# Make the VnTokenizer scripts importable before importing vn_tokenizer
DJANGO_PATH_TOKENIZER = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'VnTokenizer/scripts')
sys.path.append(DJANGO_PATH_TOKENIZER)

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in modern scikit-learn
from bs4 import BeautifulSoup
from vn_tokenizer import runTokenizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
wordnet_lemmatizer = WordNetLemmatizer()

# One stopword per line; strip trailing whitespace
stopwords = set(w.rstrip() for w in open('stopwords.txt', encoding='utf8'))

with open('hotel/positive.review', encoding='utf8') as infile:
    positive_reviews = BeautifulSoup(infile, "html.parser")
positive_reviews = positive_reviews.findAll('review_text')

with open('hotel/negative.review', encoding='utf8') as infile:
    negative_reviews = BeautifulSoup(infile, "html.parser")
negative_reviews = negative_reviews.findAll('review_text')

# Balance the classes: keep only as many positive reviews as there are negative ones
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]
def my_tokenizer(s):
    s = s.lower()
    tokens = runTokenizer(s)                       # VnTokenizer word segmentation
    tokens = [t for t in tokens if len(t) > 1]     # drop single-character tokens
    tokens = [t for t in tokens if t not in stopwords]
    return tokens
# Build the vocabulary: map each distinct token to a column index
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
def tokens_to_vector(tokens, label):
    # Last slot holds the label; the rest is a normalized term-frequency vector
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum()  # an empty token list yields a NaN row, dropped below
    x[-1] = label
    return x
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    data[i, :] = tokens_to_vector(tokens, 1)
    i += 1
for tokens in negative_tokenized:
    data[i, :] = tokens_to_vector(tokens, 0)
    i += 1

np.random.shuffle(data)
data = data[~np.isnan(data).any(axis=1)]  # drop rows whose tokens were all filtered out
# Split features (all columns but the last) from labels (last column),
# then hold out part of the data for testing
X = data[:, :-1]
Y = data[:, -1]
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2)

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
y_pred = model.predict(Xtest)

print(f1_score(Ytest, y_pred, average="macro"))
print(precision_score(Ytest, y_pred, average="macro"))
print(recall_score(Ytest, y_pred, average="macro"))
print(accuracy_score(Ytest, y_pred))
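Optionally, since classification_report and confusion_matrix are already imported, two extra lines print the per-class precision/recall/F1 and the confusion matrix in one shot:

print(classification_report(Ytest, y_pred))
print(confusion_matrix(Ytest, y_pred))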
I have a text file of 100 lines that I want to use as the test data.
How do I replace the test set in this source with my own file?
I've tried a few ways of swapping in my own Xtest, but they all fail with the error x has 785 features per sample; expecting 9571. I have no idea how to handle it.
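That mismatch usually means the external file was vectorized with its own, smaller vocabulary (785 words) instead of the training word_index_map (9571 words). The model expects exactly one column per training word, so new text has to be mapped through the same word_index_map, skipping words never seen in training. A minimal sketch, assuming the 100-line file is named my_test.txt with one review per line (both the file name and the line_to_vector helper are assumptions, not part of the original source):

# Vectorize an external review with the TRAINING vocabulary, so the
# feature count matches what the fitted model expects
def line_to_vector(line):
    x = np.zeros(len(word_index_map))   # same width as the training features
    for t in my_tokenizer(line):
        if t in word_index_map:         # skip words never seen in training
            x[word_index_map[t]] += 1
    if x.sum() > 0:
        x = x / x.sum()                 # same normalization as tokens_to_vector
    return x

with open('my_test.txt', encoding='utf8') as f:
    Xtest_new = np.array([line_to_vector(line) for line in f])

y_pred_new = model.predict(Xtest_new)   # feature counts now match the model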
Do you mean by hand or in code? By hand, just copy and paste. Your question is still not clear.