IMDB(Internet Movie Database) 영화 리뷰 데이터셋
- 다운로드: http://ai.stanford.edu/~amaas/data/sentiment/
- train의 unsup 폴더는 제거한다. (비지도학습용)
- load_files()
  - 분류 범주별로 폴더를 나누어 저장한 텍스트 파일들을 load한다.
  - Bunch 타입으로 반환
from sklearn.datasets import load_files
import numpy as np
import pandas as pd
- 폴더 이름의 알파벳 순서대로 neg: 0, pos: 1로 레이블이 부여된다.
- Bunch 타입으로 반환
# load_files() expects one sub-folder per class and loads the text files in
# each folder as that class's documents.
# Restricting `categories` keeps the labels binary (neg: 0, pos: 1) even when
# the unlabeled `unsup` folder has not been deleted from aclImdb/train.
review_train = load_files("aclImdb/train", categories=["neg", "pos"])
review_test = load_files('aclImdb/test', categories=["neg", "pos"])
type(review_train), review_train.keys()
# (sklearn.utils.Bunch, dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR']))
# data holds the documents; target holds the labels (y)
type(review_train.data), type(review_train.target)
# (list, numpy.ndarray)
전처리
- `<br />` 태그 제거
- binary string을 string으로 변환
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
def text_preprocessing(documents):
    """Clean raw review documents and return them as normalized strings.

    Each document is decoded (when it is a byte string), lower-cased, stripped
    of ``<br />`` tags, tokenized into alphabetic words, filtered against the
    NLTK English stop-word list, and stemmed with the Snowball stemmer.

    Parameters
    ----------
    documents : iterable of bytes or str
        Raw documents. Byte strings are decoded as UTF-8; ``str`` documents
        are used as they are.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input document.
    """
    # A set gives O(1) membership tests; a list would be scanned linearly
    # for every single token.
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    return_list = []
    for doc in documents:
        # binary string => string (already-decoded documents pass through)
        if isinstance(doc, bytes):
            doc = doc.decode('utf-8')
        # convert to lower case
        doc = doc.lower()
        # remove <br /> tags (replace them with a space)
        doc = doc.replace('<br />', ' ')
        # tokenize: keep runs of alphabetic characters only
        tokens = nltk.regexp_tokenize(doc, r'[A-Za-z]+')
        # drop stop words, then stem the remaining tokens
        tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
        # join the token list back into a single string
        return_list.append(' '.join(tokens))
    return return_list
# Preprocess the documents of each split; the labels are taken as loaded.
X_train, y_train = text_preprocessing(review_train.data), review_train.target
X_test, y_test = text_preprocessing(review_test.data), review_test.target
전처리한 데이터셋 파일로 저장
import os
import pickle

# Directory that receives the preprocessed splits.
save_dir = 'imdb_text_preprocess_data'
# exist_ok=True creates the directory only when it is missing, without a
# separate isdir() check that could race with another process.
os.makedirs(save_dir, exist_ok=True)

# Pickle each split to <save_dir>/<name>.pkl
for name, split in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    with open('{}/{}.pkl'.format(save_dir, name), 'wb') as f:
        pickle.dump(split, f)
데이터셋 읽기
# Read X_train, y_train, X_test, y_test back from the pickle files.
import pickle


def _read_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as f:
        return pickle.load(f)


X_train2 = _read_pickle("imdb_text_preprocess_data/X_train.pkl")
y_train2 = _read_pickle("imdb_text_preprocess_data/y_train.pkl")
X_test2 = _read_pickle("imdb_text_preprocess_data/X_test.pkl")
y_test2 = _read_pickle("imdb_text_preprocess_data/y_test.pkl")
영화리뷰 데이터 Feature Vectorization
DTM
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training documents only.
# Pass max_features=10000 to CountVectorizer to cap the vocabulary size.
cv = CountVectorizer()
cv.fit(X_train)
# get_feature_names() was removed in scikit-learn 1.2; the fitted vocabulary_
# mapping has one entry per term and exists in every version.
len(cv.vocabulary_)  # number of terms (48956)
len(X_train)  # number of documents (25000)
# Transform both splits with the vocabulary learned from the training set
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)
X_train_cv.shape, X_test_cv.shape  # ((25000, 48956), (25000, 48956))
머신러닝 알고리즘을 이용해 긍부정 분류
로지스틱회귀
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression on the count features; fit() returns the estimator itself.
lr = LogisticRegression(max_iter=2000).fit(X_train_cv, y_train)
pred_train, pred_test = lr.predict(X_train_cv), lr.predict(X_test_cv)
# (train accuracy, test accuracy)
accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test)
# (0.99408, 0.85264)
랜덤포레스트
from sklearn.ensemble import RandomForestClassifier

# Random forest (200 trees, depth limited to 3) on the count features.
rf_clf = RandomForestClassifier(max_depth=3, n_estimators=200).fit(X_train_cv, y_train)
pred_train, pred_test = rf_clf.predict(X_train_cv), rf_clf.predict(X_test_cv)
# (train accuracy, test accuracy)
accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test)
# (0.83108, 0.82284)
TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary/weights on the training documents only,
# then transform both splits with the fitted vectorizer.
tfidf = TfidfVectorizer().fit(X_train)
train_tfidf, test_tfidf = tfidf.transform(X_train), tfidf.transform(X_test)
train_tfidf.shape, test_tfidf.shape  # ((25000, 48956), (25000, 48956))
머신러닝 알고리즘을 이용해 긍부정 분류
로지스틱회귀
# Logistic regression on the TF-IDF features; fit() returns the estimator itself.
lr = LogisticRegression(max_iter=2000).fit(train_tfidf, y_train)
pred_train, pred_test = lr.predict(train_tfidf), lr.predict(test_tfidf)
# (train accuracy, test accuracy)
accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test)
# (0.92912, 0.87968)
랜덤포레스트
# Random forest (200 trees, depth limited to 3) on the TF-IDF features.
rf_clf = RandomForestClassifier(max_depth=3, n_estimators=200)
rf_clf.fit(train_tfidf, y_train)
pred_train = rf_clf.predict(train_tfidf)
pred_test = rf_clf.predict(test_tfidf)
# (train accuracy, test accuracy)
accuracy_score(y_train, pred_train), accuracy_score(y_test, pred_test)
# Recorded output, kept as a comment so it is not evaluated as the cell's
# last expression and shown in place of the real scores:
# (0.8282, 0.81512)
728x90
반응형
'Data Analysis & ML > Text Mining' 카테고리의 다른 글
[Text Mining][텍스트마이닝][NLP] Feature Vectorize(TF-IDF, TfidfVectorizer) (0) | 2020.09.15 |
---|---|
[Text Mining][텍스트마이닝][NLP] Feature Vectorize(DTM/TDM, CountVectorizer) (0) | 2020.09.15 |
[Text Mining][텍스트마이닝][NLP] KoNLPy (0) | 2020.09.13 |
[Text Mining][텍스트마이닝][NLP] 텍스트 전처리 프로세스 (0) | 2020.09.13 |
[Text Mining][텍스트마이닝][NLP] NLTK 패키지(형태소, 어간,품사부착,원형복원) (0) | 2020.09.09 |