python machine learning using Doc2Vec (2/3)
Python을 이용해 문장을 학습시키고, 해당 문장이
의문문인지 평서문인지 판별하는 테스트를 진행해 봅니다.
1편에 이어서 학습된 데이터를 바탕으로 학습 모델을 만듭니다.
2. edit qna_test.py
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy
import pickle
# Module-level tagger instance; tokenize() calls its pos() method.
# NOTE(review): the run log in this article warns that "Twitter" has been
# renamed to "Okt" since KoNLPy v0.4.5.
twitter = Twitter()
def read_data(filename):
    """Read a tab-separated UTF-8 text file into a list of rows.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of fields obtained by splitting that line on tab characters.
    """
    with open(filename, 'r', encoding='UTF8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    return data
def tokenize(doc):
    """Turn a text into a list of '/'-joined tagger tokens.

    Calls ``pos`` on the module-level ``twitter`` tagger and joins each
    returned item with ``'/'``.

    Args:
        doc: Text to analyse.

    Returns:
        A list of strings, one per item returned by ``twitter.pos``.
    """
    # norm and stem are optional arguments of pos().
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]
# Read the training and test data files.
train_data = read_data('C:/work/python/knlp/data/qna_train.txt')
test_data = read_data('C:/work/python/knlp/data/qna_test.txt')

# Morphological analysis: tokenize column 0 and keep column 1 as the label.
train_docs = [(tokenize(row[0]), row[1]) for row in train_data]
test_docs = [(tokenize(row[0]), row[1]) for row in test_data]

# Convert to the (words, tags) record layout used for doc2vec.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
tagged_test_docs = [TaggedDocument(d, [c]) for d, c in test_docs]

# Load the previously trained Doc2Vec model.
doc_vectorizer = Doc2Vec.load('C:/work/python/knlp/model/qna.model')

# Build classifier features: one inferred vector per document, with the
# document's single tag as its label.
train_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_train_docs]
train_y = [doc.tags[0] for doc in tagged_train_docs]
test_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_test_docs]
test_y = [doc.tags[0] for doc in tagged_test_docs]

# Alternative with spelled-out parameters (not used):
#classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
classifier = LogisticRegression(random_state=1234)
classifier.fit(train_x, train_y)

# Check the score on the test set.
print( classifier.score(test_x, test_y) )
# e.g. 0.63904 (the printed value depends on the loaded model and the data)

# Save the fitted classifier to disk; the context manager makes sure the
# file handle is flushed and closed.
filename = 'C:/work/python/knlp/model/qna.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)
3. exec qna_test.py
PS C:\work\python> cd 'c:\work\python'; ${env:PYTHONIOENCODING}='UTF-8'; ${env:PYTHONUNBUFFERED}='1'; & 'C:\Users\sungm\AppData\Local\Programs\Python\Python37\python.exe' 'c:\Users\sungm\.vscode\extensions\ms-python.python-2019.6.24221\pythonFiles\ptvsd_launcher.py' '--default' '--client' '--host' 'localhost' '--port' '52582' 'c:\work\python\knlp\qna_test.py' C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\konlpy\tag\_okt.py:16: UserWarning: "Twitter" has changed to "Okt" since KoNLPy v0.4.5.
warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\jpype\_core.py:210: UserWarning:
-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False. The legacy value of True was assumed for
this session. If you are a user of an application that reported this warning,
please file a ticket with the developer.
-------------------------------------------------------------------------------
""")
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
0.2958199356913183
PS C:\work\python>
의문문인지 평서문인지 판별하는 테스트를 진행해 봅니다.
1편에 이어서 학습된 데이터를 바탕으로 학습 모델을 만듭니다.
1. 환경준비
Windows 10
python 3.7
konlpy
gensim
python 3.7
konlpy
gensim
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy
import pickle
# Module-level tagger instance; tokenize() calls its pos() method.
# NOTE(review): the run log in this article warns that "Twitter" has been
# renamed to "Okt" since KoNLPy v0.4.5.
twitter = Twitter()
def read_data(filename):
    """Read a tab-separated UTF-8 text file into a list of rows.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of fields obtained by splitting that line on tab characters.
    """
    with open(filename, 'r', encoding='UTF8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    return data
def tokenize(doc):
    """Turn a text into a list of '/'-joined tagger tokens.

    Calls ``pos`` on the module-level ``twitter`` tagger and joins each
    returned item with ``'/'``.

    Args:
        doc: Text to analyse.

    Returns:
        A list of strings, one per item returned by ``twitter.pos``.
    """
    # norm and stem are optional arguments of pos().
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]
# Read the training and test data files.
train_data = read_data('C:/work/python/knlp/data/qna_train.txt')
test_data = read_data('C:/work/python/knlp/data/qna_test.txt')

# Morphological analysis: tokenize column 0 and keep column 1 as the label.
train_docs = [(tokenize(row[0]), row[1]) for row in train_data]
test_docs = [(tokenize(row[0]), row[1]) for row in test_data]

# Convert to the (words, tags) record layout used for doc2vec.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
tagged_test_docs = [TaggedDocument(d, [c]) for d, c in test_docs]

# Load the previously trained Doc2Vec model.
doc_vectorizer = Doc2Vec.load('C:/work/python/knlp/model/qna.model')

# Build classifier features: one inferred vector per document, with the
# document's single tag as its label.
train_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_train_docs]
train_y = [doc.tags[0] for doc in tagged_train_docs]
test_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_test_docs]
test_y = [doc.tags[0] for doc in tagged_test_docs]

# Alternative with spelled-out parameters (not used):
#classifier = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
classifier = LogisticRegression(random_state=1234)
classifier.fit(train_x, train_y)

# Check the score on the test set.
print( classifier.score(test_x, test_y) )
# e.g. 0.63904 (the printed value depends on the loaded model and the data)

# Save the fitted classifier to disk; the context manager makes sure the
# file handle is flushed and closed.
filename = 'C:/work/python/knlp/model/qna.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)
3. exec qna_test.py
PS C:\work\python> cd 'c:\work\python'; ${env:PYTHONIOENCODING}='UTF-8'; ${env:PYTHONUNBUFFERED}='1'; & 'C:\Users\sungm\AppData\Local\Programs\Python\Python37\python.exe' 'c:\Users\sungm\.vscode\extensions\ms-python.python-2019.6.24221\pythonFiles\ptvsd_launcher.py' '--default' '--client' '--host' 'localhost' '--port' '52582' 'c:\work\python\knlp\qna_test.py' C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\konlpy\tag\_okt.py:16: UserWarning: "Twitter" has changed to "Okt" since KoNLPy v0.4.5.
warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\jpype\_core.py:210: UserWarning:
-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False. The legacy value of True was assumed for
this session. If you are a user of an application that reported this warning,
please file a ticket with the developer.
-------------------------------------------------------------------------------
""")
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
0.2958199356913183
PS C:\work\python>
댓글
댓글 쓰기