python machine learning using Doc2Vec (3/3)
python을 이용해서 문장 학습을 시키고 해당 문장이
의문문인지 평문인지에 대한 테스트를 진행해 봅니다.
2편에 이어서 학습된 모델을 가지고 실제 분류확인을 해보겠습니다.
2. edit qna_run.py
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy
import pickle
# Shared KoNLPy tagger instance; tokenize() calls its pos() method.
# NOTE(review): the run log shown later in this post reports the warning
# '"Twitter" has changed to "Okt" since KoNLPy v0.4.5.' -- consider migrating.
twitter = Twitter()
def read_data(filename):
    """Read a UTF-8 text file and split every line into tab-separated columns.

    Blank or whitespace-only lines (for example an empty line at the end of
    the file) are skipped. Without this they would produce one-column rows
    such as ``['']``, and callers that index ``row[1]`` would fail with
    ``IndexError``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        strings obtained by splitting that line on tab characters.
    """
    with open(filename, 'r', encoding='UTF8') as f:
        return [
            line.split('\t')
            for line in f.read().splitlines()
            if line.strip()
        ]
def tokenize(doc):
    """Tag *doc* with the module-level ``twitter`` tagger and flatten the result.

    Every item returned by ``twitter.pos`` is joined with ``'/'`` into a
    single string. ``norm`` and ``stem`` are optional arguments of
    ``twitter.pos``; both are switched on here.
    """
    tagged = twitter.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]
# Read the data to classify; row[0] is the sentence and row[1] is kept as
# the document tag.
run_data = read_data('C:/work/python/knlp/data/qna_run.txt')
# Tokenize every sentence, keeping the second column alongside it.
run_docs = [(tokenize(row[0]), row[1]) for row in run_data]
# Convert to the (words, tags) document format needed for doc2vec.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_run_docs = [TaggedDocument(d, [c]) for d, c in run_docs]
# Load the trained Doc2Vec model.
doc_vectorizer = Doc2Vec.load('C:/work/python/knlp/model/qna.model')
# Build the classifier features: one inferred vector per document.
run_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_run_docs]
# Kept for parity with the original script; not used by the check below.
run_y = [doc.tags[0] for doc in tagged_run_docs]
# Load the trained classifier from disk.
filename = 'C:/work/python/knlp/model/qna.sav'
# NOTE: unpickling can execute arbitrary code -- only load model files you
# trust. The ``with`` block closes the file handle, which the bare
# ``pickle.load(open(...))`` form never did.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Classify every row. The original repeated this check by hand for indices
# 0-3 only, so it crashed on shorter input and ignored any extra rows.
for row, features in zip(run_data, run_x):
    # predict() is given a single sample reshaped to (1, n_features).
    is_question = loaded_model.predict(features.reshape(1, -1))[0] == 1
    print(row[0] + (" : 질문" if is_question else " : 평문"))
3. result
PS C:\work\python> cd 'c:\work\python'; ${env:PYTHONIOENCODING}='UTF-8'; ${env:PYTHONUNBUFFERED}='1'; & 'C:\Users\sungm\AppData\Local\Programs\Python\Python37\python.exe' 'c:\Users\sungm\.vscode\extensions\ms-python.python-2019.6.24221\pythonFiles\ptvsd_launcher.py' '--default' '--client' '--host' 'localhost' '--port' '62368' 'c:\work\python\knlp\qna_run.py' C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\konlpy\tag\_okt.py:16: UserWarning: "Twitter" has changed to "Okt" since KoNLPy v0.4.5.
warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\jpype\_core.py:210: UserWarning:
-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False. The legacy value of True was assumed for
this session. If you are a user of an application that reported this warning,
please file a ticket with the developer.
-------------------------------------------------------------------------------
""")
오늘 날씨가 좋네요. : 평문
오늘 날씨 어때요? : 질문
모니터 추천 부탁드립니다. : 질문
카카오톡 챗봇 만들기 어렵나요? : 질문
의문문인지 평문인지에 대한 테스트를 진행해 봅니다.
2편에 이어서 학습된 모델을 가지고 실제 분류확인을 해보겠습니다.
1. 환경준비
Windows 10
python 3.7
konlpy
gensim
python 3.7
konlpy
gensim
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
import numpy
import pickle
# Shared KoNLPy tagger instance; tokenize() calls its pos() method.
# NOTE(review): the run log shown later in this post reports the warning
# '"Twitter" has changed to "Okt" since KoNLPy v0.4.5.' -- consider migrating.
twitter = Twitter()
def read_data(filename):
    """Read a UTF-8 text file and split every line into tab-separated columns.

    Blank or whitespace-only lines (for example an empty line at the end of
    the file) are skipped. Without this they would produce one-column rows
    such as ``['']``, and callers that index ``row[1]`` would fail with
    ``IndexError``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        strings obtained by splitting that line on tab characters.
    """
    with open(filename, 'r', encoding='UTF8') as f:
        return [
            line.split('\t')
            for line in f.read().splitlines()
            if line.strip()
        ]
def tokenize(doc):
    """Tag *doc* with the module-level ``twitter`` tagger and flatten the result.

    Every item returned by ``twitter.pos`` is joined with ``'/'`` into a
    single string. ``norm`` and ``stem`` are optional arguments of
    ``twitter.pos``; both are switched on here.
    """
    tagged = twitter.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]
# Read the data to classify; row[0] is the sentence and row[1] is kept as
# the document tag.
run_data = read_data('C:/work/python/knlp/data/qna_run.txt')
# Tokenize every sentence, keeping the second column alongside it.
run_docs = [(tokenize(row[0]), row[1]) for row in run_data]
# Convert to the (words, tags) document format needed for doc2vec.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_run_docs = [TaggedDocument(d, [c]) for d, c in run_docs]
# Load the trained Doc2Vec model.
doc_vectorizer = Doc2Vec.load('C:/work/python/knlp/model/qna.model')
# Build the classifier features: one inferred vector per document.
run_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_run_docs]
# Kept for parity with the original script; not used by the check below.
run_y = [doc.tags[0] for doc in tagged_run_docs]
# Load the trained classifier from disk.
filename = 'C:/work/python/knlp/model/qna.sav'
# NOTE: unpickling can execute arbitrary code -- only load model files you
# trust. The ``with`` block closes the file handle, which the bare
# ``pickle.load(open(...))`` form never did.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Classify every row. The original repeated this check by hand for indices
# 0-3 only, so it crashed on shorter input and ignored any extra rows.
for row, features in zip(run_data, run_x):
    # predict() is given a single sample reshaped to (1, n_features).
    is_question = loaded_model.predict(features.reshape(1, -1))[0] == 1
    print(row[0] + (" : 질문" if is_question else " : 평문"))
3. result
PS C:\work\python> cd 'c:\work\python'; ${env:PYTHONIOENCODING}='UTF-8'; ${env:PYTHONUNBUFFERED}='1'; & 'C:\Users\sungm\AppData\Local\Programs\Python\Python37\python.exe' 'c:\Users\sungm\.vscode\extensions\ms-python.python-2019.6.24221\pythonFiles\ptvsd_launcher.py' '--default' '--client' '--host' 'localhost' '--port' '62368' 'c:\work\python\knlp\qna_run.py' C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\konlpy\tag\_okt.py:16: UserWarning: "Twitter" has changed to "Okt" since KoNLPy v0.4.5.
warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\jpype\_core.py:210: UserWarning:
-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False. The legacy value of True was assumed for
this session. If you are a user of an application that reported this warning,
please file a ticket with the developer.
-------------------------------------------------------------------------------
""")
오늘 날씨가 좋네요. : 평문
오늘 날씨 어때요? : 질문
모니터 추천 부탁드립니다. : 질문
카카오톡 챗봇 만들기 어렵나요? : 질문
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False. The legacy value of True was assumed for
this session. If you are a user of an application that reported this warning,
please file a ticket with the developer.
이 오류는 왜 뜨는 건가요?