今回もGensimを用いて言語の分析をする.
今回は Gensim の Phrases モジュールを使って、複数の単語からなるフレーズ(例: “natural language”)を自動的に検出する.
from gensim.models import Phrases
from gensim.models.phrases import Phraser
import re
# Sample sentences used as the toy corpus for phrase detection.
sentences = [
    "Gensim is a useful library for natural language processing.",
    "Topic modelling is an interesting field in data science.",
    "Word2Vec and Doc2Vec are popular models.",
    "Natural language processing involves machine learning.",
]
# Preprocessing: lowercase, drop non-word characters, tokenize on whitespace.
def preprocess(sentence: str) -> list:
    """Turn a raw sentence into a list of lowercase word tokens.

    The sentence is lowercased and every maximal run of word characters
    (``\\w``: letters, digits, underscore) becomes one token. Punctuation
    and whitespace act purely as separators and never appear in the output.

    Args:
        sentence: The raw input text.

    Returns:
        A list of token strings in their original order; empty when the
        input contains no word characters.
    """
    # Extracting runs of \w is equivalent to replacing runs of \W with a
    # space and then splitting on whitespace, but does it in one pass.
    return re.findall(r"\w+", sentence.lower())
# Tokenize every sample sentence before training the phrase model.
processed_sentences = [preprocess(sentence) for sentence in sentences]

# Run phrase detection. min_count=1 and threshold=1 are deliberately
# permissive so that collocations can surface even in this tiny corpus;
# larger corpora would normally use higher values.
phrases = Phrases(processed_sentences, min_count=1, threshold=1)
# Wrap the trained model in a Phraser, which is then used to transform sentences.
phraser = Phraser(phrases)

# Print each sentence with detected phrases merged into single tokens
# (e.g. "natural_language").
for sentence in processed_sentences:
    print(phraser[sentence])
['gensim', 'is', 'a', 'useful', 'library', 'for', 'natural_language', 'processing']
['topic', 'modelling', 'is', 'an', 'interesting', 'field', 'in', 'data', 'science']
['word2vec', 'and', 'doc2vec', 'are', 'popular', 'models']
['natural_language', 'processing', 'involves', 'machine', 'learning']