
[Text Mining][NLP] IMDB (Internet Movie Database) Text Mining

YSY^ 2020. 9. 15. 18:53

IMDB (Internet Movie Database) Movie Review Dataset

  • Labels are assigned as neg: 0, pos: 1 (in alphabetical order of the class folder names).
  • The result is returned as a Bunch object.

from sklearn.datasets import load_files
import numpy as np
import pandas as pd

review_train = load_files("aclImdb/train")  # expects one folder per class, with the document text files stored inside each folder
review_test = load_files('aclImdb/test')

type(review_train), review_train.keys()
# (sklearn.utils.Bunch, dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR']))

# label/y
type(review_train.data), type(review_train.target)
# (list, numpy.ndarray)
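
Assuming the unsup/ folder has been removed from aclImdb/train so that only neg/ and pos/ remain, the label mapping and class balance can be checked directly (a minimal sketch):

# class names come from the folder names, in alphabetical order
print(review_train.target_names)         # ['neg', 'pos']
print(np.bincount(review_train.target))  # 12500 documents per class in the standard split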

 

 

Preprocessing

  • Remove <br /> tags
  • Convert byte strings (bytes) to regular strings
  • Lowercase, tokenize, remove stopwords, and apply stemming
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
# nltk.download('stopwords')  # run once if the stopwords corpus has not been downloaded yet

def text_preprocessing(documents):

    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer('english')

    return_list = []
    for doc in documents:
        # byte string => string
        doc = doc.decode('utf-8')
        # convert to lowercase
        doc = doc.lower()
        # remove <br /> tags (replace with a space)
        doc = doc.replace('<br />', ' ')
        # tokenize
        tokens = nltk.regexp_tokenize(doc, r'[A-Za-z]+')
        # remove stopwords + apply stemming
        tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
        # join the token list back into a single string
        return_list.append(' '.join(tokens))

    return return_list
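
A quick sanity check on a single review makes the effect of the function visible (a minimal sketch using the first training document):

sample = review_train.data[0]                  # raw bytes of the first training review
print(sample.decode('utf-8')[:200])            # original text
print(text_preprocessing([sample])[0][:200])   # lowercased, stemmed, stopwords and <br /> removed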

 

X_train = text_preprocessing(review_train.data)
y_train = review_train.target

X_test = text_preprocessing(review_test.data)
y_test = review_test.target

Saving the Preprocessed Dataset to Files

import os
if not os.path.isdir('imdb_text_preprocess_data'):  # if the directory does not exist
    os.mkdir('imdb_text_preprocess_data')            # create it

import pickle
# X_train
with open('imdb_text_preprocess_data/X_train.pkl', 'wb') as f:
    pickle.dump(X_train, f)

# y_train
with open('imdb_text_preprocess_data/y_train.pkl', 'wb') as f:
    pickle.dump(y_train, f)

# X_test
with open('imdb_text_preprocess_data/X_test.pkl', 'wb') as f:
    pickle.dump(X_test, f)


# y_test
with open('imdb_text_preprocess_data/y_test.pkl', 'wb') as f:
    pickle.dump(y_test, f)

Reading the Dataset Back

# Code for reading X_train, y_train, X_test, y_test back from the pickle files.

import pickle
with open("imdb_text_preprocess_data/X_train.pkl", 'rb') as f:
    X_train2 = pickle.load(f)

with open("imdb_text_preprocess_data/y_train.pkl", 'rb') as f:
    y_train2 = pickle.load(f)

with open("imdb_text_preprocess_data/X_test.pkl", 'rb') as f:
    X_test2 = pickle.load(f)    

with open("imdb_text_preprocess_data/y_test.pkl", 'rb') as f:
    y_test2 = pickle.load(f)  
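
An optional round-trip check (sketch) confirms the reloaded objects match the originals:

print(X_train2 == X_train)                # True: the lists of preprocessed strings are identical
print(np.array_equal(y_train2, y_train))  # True: the label arrays are identical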

 

Feature Vectorization of the Movie Review Data

DTM (Document-Term Matrix)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# cv = CountVectorizer(max_features=10000)
cv.fit(X_train)

len(cv.get_feature_names())  # number of terms (48956); use get_feature_names_out() on scikit-learn >= 1.0

len(X_train)  # number of documents (25000)

# transform the documents into count vectors
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

X_train_cv.shape, X_test_cv.shape  # ((25000, 48956), (25000, 48956))
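
To get a feel for the DTM, the column sums give each term's total count, and the most frequent terms can be listed (a sketch; the exact top terms depend on the preprocessing above):

terms = np.array(cv.get_feature_names())             # get_feature_names_out() on scikit-learn >= 1.0
counts = np.asarray(X_train_cv.sum(axis=0)).ravel()  # total count of each term across all documents

for idx in np.argsort(counts)[::-1][:10]:            # 10 most frequent terms
    print(terms[idx], counts[idx])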

Positive/Negative Classification with Machine Learning Algorithms

Logistic Regression

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=2000)
lr.fit(X_train_cv, y_train)

pred_train = lr.predict(X_train_cv)
pred_test = lr.predict(X_test_cv)

from sklearn.metrics import accuracy_score
accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test)
# (0.99408, 0.85264)
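
Because logistic regression is a linear model, its coefficients show which (stemmed) terms pull a review toward each class (a sketch based on the cv and lr objects fitted above):

terms = np.array(cv.get_feature_names())  # get_feature_names_out() on scikit-learn >= 1.0
coef = lr.coef_[0]                        # one weight per term; positive values push toward class 1 (pos)

order = np.argsort(coef)
print("most negative terms:", terms[order[:10]])
print("most positive terms:", terms[order[-10:]])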

Random Forest

from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(max_depth=3, n_estimators=200)
rf_clf.fit(X_train_cv, y_train)

pred_train = rf_clf.predict(X_train_cv)
pred_test = rf_clf.predict(X_test_cv)

accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test)
# (0.83108, 0.82284)
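
Accuracy alone hides how the errors split across the two classes; a confusion matrix and per-class report give a fuller picture (sketch, reusing the random forest test predictions above):

from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, pred_test))
print(classification_report(y_test, pred_test, target_names=review_test.target_names))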

 

TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

tfidf.fit(X_train)

train_tfidf = tfidf.transform(X_train)
test_tfidf = tfidf.transform(X_test)

train_tfidf.shape, test_tfidf.shape # ((25000, 48956), (25000, 48956))
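
Unlike raw counts, TF-IDF down-weights terms that appear in many documents; scikit-learn's default weighting is tf * (ln((1+n)/(1+df)) + 1) with L2 row normalization. A toy corpus (a sketch, not taken from the review data) makes the effect visible:

from sklearn.feature_extraction.text import TfidfVectorizer

toy = ["good movie", "bad movie", "good acting good story"]
toy_tfidf = TfidfVectorizer()
m = toy_tfidf.fit_transform(toy)

# "movie" appears in 2 of the 3 documents, so its idf (and hence its weight) is lower
# than that of rarer terms such as "acting" or "story".
print(toy_tfidf.get_feature_names())  # get_feature_names_out() on scikit-learn >= 1.0
print(m.toarray().round(3))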

Positive/Negative Classification with Machine Learning Algorithms

Logistic Regression

lr = LogisticRegression(max_iter=2000)
lr.fit(train_tfidf, y_train)

pred_train = lr.predict(train_tfidf)
pred_test = lr.predict(test_tfidf)

accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test)
# (0.92912, 0.87968)

Random Forest

rf_clf = RandomForestClassifier(max_depth=3, n_estimators=200)
rf_clf.fit(train_tfidf, y_train)

pred_train = rf_clf.predict(train_tfidf)
pred_test = rf_clf.predict(test_tfidf)

accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test)
# (0.8282, 0.81512)
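
As a closing note, the vectorizer and classifier can be wrapped in a single Pipeline so the same transformation is applied consistently at fit and predict time (a minimal sketch equivalent to the TF-IDF + logistic regression steps above):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=2000)),
])
pipe.fit(X_train, y_train)          # X_train: the preprocessed review strings from above
print(pipe.score(X_test, y_test))   # should land close to the 0.87968 reported for TF-IDF + logistic regression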