import os, sys
import nltk
import numpy as np
import matplotlib.pyplot as plt

# Make the VnTokenizer scripts importable before importing vn_tokenizer
DJANGO_PATH_TOKENIZER = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'VnTokenizer/scripts')
sys.path.append(DJANGO_PATH_TOKENIZER)

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in modern scikit-learn
from bs4 import BeautifulSoup
from vn_tokenizer import runTokenizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
wordnet_lemmatizer = WordNetLemmatizer()

# One stopword per line; strip trailing whitespace
stopwords = set(w.rstrip() for w in open('stopwords.txt', encoding='utf8'))

with open('hotel/positive.review', encoding='utf8') as infile:
    positive_reviews = BeautifulSoup(infile, "html.parser")
positive_reviews = positive_reviews.findAll('review_text')

with open('hotel/negative.review', encoding='utf8') as infile:
    negative_reviews = BeautifulSoup(infile, "html.parser")
negative_reviews = negative_reviews.findAll('review_text')

# Balance the classes: keep only as many positive reviews as there are negative ones
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]
def my_tokenizer(s):
    s = s.lower()
    tokens = runTokenizer(s)                       # VnTokenizer word segmentation
    tokens = [t for t in tokens if len(t) > 1]     # drop single-character tokens
    tokens = [t for t in tokens if t not in stopwords]
    return tokens
# Build the vocabulary: map each distinct token to a column index
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
def tokens_to_vector(tokens, label):
    # Last slot holds the label; the rest is a normalized term-frequency vector
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum()  # an empty token list yields a NaN row, dropped below
    x[-1] = label
    return x
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    data[i, :] = tokens_to_vector(tokens, 1)
    i += 1
for tokens in negative_tokenized:
    data[i, :] = tokens_to_vector(tokens, 0)
    i += 1

np.random.shuffle(data)
data = data[~np.isnan(data).any(axis=1)]  # drop rows whose tokens were all filtered out
# Split features (all columns but the last) from labels (last column),
# then hold out part of the data for testing
X = data[:, :-1]
Y = data[:, -1]
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2)

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
y_pred = model.predict(Xtest)

print(f1_score(Ytest, y_pred, average="macro"))
print(precision_score(Ytest, y_pred, average="macro"))
print(recall_score(Ytest, y_pred, average="macro"))
print(accuracy_score(Ytest, y_pred))
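Optionally, since classification_report and confusion_matrix are already imported, two extra lines print the per-class precision/recall/F1 and the confusion matrix in one shot:

print(classification_report(Ytest, y_pred))
print(confusion_matrix(Ytest, y_pred))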
I have a text file of 100 lines that I want to use as the test data.
How do I replace the test set in this source with my own file?
I've tried a few ways of swapping in my own Xtest, but they all fail with the error x has 785 features per sample; expecting 9571. I have no idea how to handle it.
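That mismatch usually means the external file was vectorized with its own, smaller vocabulary (785 words) instead of the training word_index_map (9571 words). The model expects exactly one column per training word, so new text has to be mapped through the same word_index_map, skipping words never seen in training. A minimal sketch, assuming the 100-line file is named my_test.txt with one review per line (both the file name and the line_to_vector helper are assumptions, not part of the original source):

# Vectorize an external review with the TRAINING vocabulary, so the
# feature count matches what the fitted model expects
def line_to_vector(line):
    x = np.zeros(len(word_index_map))   # same width as the training features
    for t in my_tokenizer(line):
        if t in word_index_map:         # skip words never seen in training
            x[word_index_map[t]] += 1
    if x.sum() > 0:
        x = x / x.sum()                 # same normalization as tokens_to_vector
    return x

with open('my_test.txt', encoding='utf8') as f:
    Xtest_new = np.array([line_to_vector(line) for line in f])

y_pred_new = model.predict(Xtest_new)   # feature counts now match the model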
Do you mean by hand or in code? By hand, just copy and paste. Your question is still not clear.