python machine learning using Doc2Vec (1/3)
python을 이용해서 문장 학습을 시키고 해당 문장이
의문문인지 평서문인지에 대한 테스트를 진행해 봅니다.
2. edit qna_train.py
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
# Shared morphological analyzer instance used by tokenize().
# NOTE(review): the run log of this script shows KoNLPy warning that
# "Twitter" has changed to "Okt" since v0.4.5 - consider migrating.
twitter = Twitter()
def read_data(filename):
    """Load a tab-separated UTF-8 text file into a list of rows.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        fields obtained by splitting that line on tab characters.
    """
    with open(filename, 'r', encoding='UTF8') as f:
        # Blank lines are skipped: they would otherwise yield [''] rows and
        # break callers that index a second column.
        return [line.split('\t') for line in f.read().splitlines() if line.strip()]
def tokenize(doc):
    """Split a text into 'token/tag' strings.

    Args:
        doc: Text to analyze.

    Returns:
        A list of strings, one per tuple returned by ``twitter.pos``, with
        the tuple's elements joined by '/'.
    """
    # norm and stem are optional arguments of pos().
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]
# doc2vec parameters
# NOTE(review): only vector_size is passed to Doc2Vec below; the remaining
# values are defined but not used by this script.
cores = multiprocessing.cpu_count()
vector_size = 300
window_size = 15
word_min_count = 2
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 1
worker_count = cores

# Read the training data (rows are split on tabs: column 0 is the text,
# column 1 is the label).
train_data = read_data('C:/work/python/knlp/data/qna_train.txt')

# Morphological analysis: build (tokens, label) pairs.
train_docs = [(tokenize(row[0]), row[1]) for row in train_data]

# Convert to the (words, tags) structure passed to doc2vec.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

# Build the vocabulary.
# `vector_size` replaces the deprecated `size` keyword (same value, 300).
doc_vectorizer = doc2vec.Doc2Vec(vector_size=vector_size, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

# Train document vectors!
# `epochs` replaces the deprecated `iter` attribute.
for epoch in range(10):
    doc_vectorizer.train(
        tagged_train_docs,
        total_examples=doc_vectorizer.corpus_count,
        epochs=doc_vectorizer.epochs,
    )
    doc_vectorizer.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay

# To save
doc_vectorizer.save('C:/work/python/knlp/model/qna.model')

# Word-vector queries go through `.wv`; the model-level `most_similar` and
# `similarity` methods are deprecated.
pprint(doc_vectorizer.wv.most_similar('스위치/Noun'))
pprint(doc_vectorizer.wv.similarity('스위치/Noun', 'ㅋㅋ/KoreanParticle'))
<sample data qna_train.txt>
제주 렌트카 보험 관련 질문 1
차이슨~~ 청소기 괜츈한가요?? 1
스마트TV 추천 좀 해주세요~~~ 1
3. exec qna_train.py
PS C:\work\python> cd 'c:\work\python'; ${env:PYTHONIOENCODING}='UTF-8'; ${env:PYTHONUNBUFFERED}='1'; & 'C:\Users\sungm\AppData\Local\Programs\Python\Python37\python.exe' 'c:\Users\sungm\.vscode\extensions\ms-python.python-2019.6.24221\pythonFiles\ptvsd_launcher.py' '--default' '--client' '--host' 'localhost' '--port' '49299' 'c:\work\python\knlp\qna_train.py'
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\konlpy\tag\_okt.py:16: UserWarning: "Twitter" has changed to "Okt" since KoNLPy v0.4.5.
warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\jpype\_core.py:210: UserWarning:
-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False. The legacy value of True was assumed for
this session. If you are a user of an application that reported this warning,
please file a ticket with the developer.
-------------------------------------------------------------------------------
""")
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\gensim\models\doc2vec.py:574: UserWarning: The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.
warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.")
c:\work\python\knlp\qna_train.py:49: DeprecationWarning: Call to deprecated `iter` (Attribute will be removed in 4.0.0, use self.epochs instead).
doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
c:\work\python\knlp\qna_train.py:56: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
pprint(doc_vectorizer.most_similar('스위치/Noun'))
[('DS/Alpha', 0.4983493685722351),
('닌텐도/Noun', 0.48084473609924316),
('플스/Noun', 0.4436195194721222),
('전선/Noun', 0.4293632507324219),
('PS/Alpha', 0.4234338104724884),
('플러그/Noun', 0.422281950712204),
('디아블로/Noun', 0.4107228219509125),
('컴터/Noun', 0.4076959490776062),
('스팀/Noun', 0.40483903884887695),
('피파/Noun', 0.3986400365829468)]
c:\work\python\knlp\qna_train.py:57: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
pprint(doc_vectorizer.similarity('스위치/Noun', 'ㅋㅋ/KoreanParticle'))
0.051481076
PS C:\work\python>
<<https://github.com/hoho0443/classify_comment_emotion>> source 참조
의문문인지 평서문인지에 대한 테스트를 진행해 봅니다.
1. 환경준비
Windows 10
python 3.7
konlpy
gensim
python 3.7
konlpy
gensim
from collections import namedtuple
from gensim.models import doc2vec
from konlpy.tag import Twitter
import multiprocessing
from pprint import pprint
# Shared morphological analyzer instance used by tokenize().
# NOTE(review): the run log of this script shows KoNLPy warning that
# "Twitter" has changed to "Okt" since v0.4.5 - consider migrating.
twitter = Twitter()
def read_data(filename):
    """Load a tab-separated UTF-8 text file into a list of rows.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        fields obtained by splitting that line on tab characters.
    """
    with open(filename, 'r', encoding='UTF8') as f:
        # Blank lines are skipped: they would otherwise yield [''] rows and
        # break callers that index a second column.
        return [line.split('\t') for line in f.read().splitlines() if line.strip()]
def tokenize(doc):
    """Split a text into 'token/tag' strings.

    Args:
        doc: Text to analyze.

    Returns:
        A list of strings, one per tuple returned by ``twitter.pos``, with
        the tuple's elements joined by '/'.
    """
    # norm and stem are optional arguments of pos().
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]
# doc2vec parameters
# NOTE(review): only vector_size is passed to Doc2Vec below; the remaining
# values are defined but not used by this script.
cores = multiprocessing.cpu_count()
vector_size = 300
window_size = 15
word_min_count = 2
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 1
worker_count = cores

# Read the training data (rows are split on tabs: column 0 is the text,
# column 1 is the label).
train_data = read_data('C:/work/python/knlp/data/qna_train.txt')

# Morphological analysis: build (tokens, label) pairs.
train_docs = [(tokenize(row[0]), row[1]) for row in train_data]

# Convert to the (words, tags) structure passed to doc2vec.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

# Build the vocabulary.
# `vector_size` replaces the deprecated `size` keyword (same value, 300).
doc_vectorizer = doc2vec.Doc2Vec(vector_size=vector_size, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

# Train document vectors!
# `epochs` replaces the deprecated `iter` attribute.
for epoch in range(10):
    doc_vectorizer.train(
        tagged_train_docs,
        total_examples=doc_vectorizer.corpus_count,
        epochs=doc_vectorizer.epochs,
    )
    doc_vectorizer.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay

# To save
doc_vectorizer.save('C:/work/python/knlp/model/qna.model')

# Word-vector queries go through `.wv`; the model-level `most_similar` and
# `similarity` methods are deprecated.
pprint(doc_vectorizer.wv.most_similar('스위치/Noun'))
pprint(doc_vectorizer.wv.similarity('스위치/Noun', 'ㅋㅋ/KoreanParticle'))
제주 렌트카 보험 관련 질문 1
차이슨~~ 청소기 괜츈한가요?? 1
스마트TV 추천 좀 해주세요~~~ 1
<sample data qna_test.txt>
벤큐 빔프로젝터 모델 추천 좀 부탁드립니다. 2가지 모델중에 선택하려 합니다. 1
외부 사이트 구글 지도 불러오는 방법? 1
햇반, 후쿠시마산 쌀 추출물 사용' 논란에 CJ가 내놓은 해명 0
성범죄자 고지정보서 날라와서 동네가 난리났습니다. 0
<sample data qna_run.txt>
주문했는데 제품 박스에 4.2 라 적혀 있네요. 1
[질문] 천연대리석 깨졌는데 원래 아주작은 돌가루들로 된건가요?? 2
모니터 추천 부탁드립니다. 3
카카오톡 챗봇 만들기 어렵나요? 4
PS C:\work\python> cd 'c:\work\python'; ${env:PYTHONIOENCODING}='UTF-8'; ${env:PYTHONUNBUFFERED}='1'; & 'C:\Users\sungm\AppData\Local\Programs\Python\Python37\python.exe' 'c:\Users\sungm\.vscode\extensions\ms-python.python-2019.6.24221\pythonFiles\ptvsd_launcher.py' '--default' '--client' '--host' 'localhost' '--port' '49299' 'c:\work\python\knlp\qna_train.py'
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\konlpy\tag\_okt.py:16: UserWarning: "Twitter" has changed to "Okt" since KoNLPy v0.4.5.
warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\jpype\_core.py:210: UserWarning:
-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False. The legacy value of True was assumed for
this session. If you are a user of an application that reported this warning,
please file a ticket with the developer.
-------------------------------------------------------------------------------
""")
C:\Users\sungm\AppData\Local\Programs\Python\Python37\lib\site-packages\gensim\models\doc2vec.py:574: UserWarning: The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.
warnings.warn("The parameter `size` is deprecated, will be removed in 4.0.0, use `vector_size` instead.")
c:\work\python\knlp\qna_train.py:49: DeprecationWarning: Call to deprecated `iter` (Attribute will be removed in 4.0.0, use self.epochs instead).
doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
c:\work\python\knlp\qna_train.py:56: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
pprint(doc_vectorizer.most_similar('스위치/Noun'))
[('DS/Alpha', 0.4983493685722351),
('닌텐도/Noun', 0.48084473609924316),
('플스/Noun', 0.4436195194721222),
('전선/Noun', 0.4293632507324219),
('PS/Alpha', 0.4234338104724884),
('플러그/Noun', 0.422281950712204),
('디아블로/Noun', 0.4107228219509125),
('컴터/Noun', 0.4076959490776062),
('스팀/Noun', 0.40483903884887695),
('피파/Noun', 0.3986400365829468)]
c:\work\python\knlp\qna_train.py:57: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
pprint(doc_vectorizer.similarity('스위치/Noun', 'ㅋㅋ/KoreanParticle'))
0.051481076
PS C:\work\python>
<<https://github.com/hoho0443/classify_comment_emotion>> source 참조
댓글
댓글 쓰기