分析コードをまとめていく.共起語ネットワーク編
前回に続いて,コードをまとめる.
今回は共起語ネットワークにフォーカスして分析する.
import nltk
import re
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter
nltk.download('punkt')
def preprocess_text(text):
    """Strip punctuation from ``text`` and split the result into tokens.

    Every character that is neither a word character (``\\w``) nor
    whitespace (``\\s``) is deleted, and the remaining text is handed to
    ``nltk.word_tokenize``.

    Args:
        text: The raw string to clean and tokenize.

    Returns:
        The token list produced by ``nltk.word_tokenize``.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return nltk.word_tokenize(without_punctuation)
def build_cooccurrence_matrix(tokens, window_size=2):
    """Count how often two different tokens occur close to each other.

    Two tokens co-occur when their positions are fewer than ``window_size``
    apart, i.e. when both fit inside one window of ``window_size``
    consecutive tokens. Each pair of positions is counted exactly once, so
    overlapping windows do not inflate the counts, and texts shorter than
    ``window_size`` still contribute their pairs.

    Co-occurrence is symmetric, so every pair is stored under a single key
    with its two tokens in sorted order: ``("a", "b")`` covers both
    "a ... b" and "b ... a".

    Args:
        tokens: Sequence of tokens (must support slicing and ``<=``
            comparison between tokens, as strings do).
        window_size: Number of consecutive tokens regarded as one context
            window. Values below 2 cannot contain a pair and yield an
            empty result.

    Returns:
        A ``Counter`` mapping ``(token_a, token_b)`` tuples, with
        ``token_a <= token_b``, to their co-occurrence count.
    """
    cooccurrence_counts = Counter()
    if window_size < 2:
        # A window of fewer than two tokens can never hold a pair.
        return cooccurrence_counts

    for i, w1 in enumerate(tokens):
        # Pair the token only with the ones that follow it inside the
        # window; earlier neighbours were already paired with it when they
        # were the anchor. Slicing clamps at the end of the sequence.
        for w2 in tokens[i + 1:i + window_size]:
            if w1 != w2:  # skip pairs made of the same word
                pair = (w1, w2) if w1 <= w2 else (w2, w1)
                cooccurrence_counts[pair] += 1
    return cooccurrence_counts
def plot_cooccurrence_network(cooccurrence_counts, top_n=30):
    """Draw the most frequent co-occurrence pairs as an undirected network.

    The graph is undirected, so ``(a, b)`` and ``(b, a)`` describe the same
    edge. Both orientations are merged into one count before the ``top_n``
    strongest pairs are selected; otherwise the later orientation would
    overwrite the earlier edge weight and duplicates would use up ranking
    slots. Edge labels are read back from the graph so they always match
    the weights that were actually drawn.

    Args:
        cooccurrence_counts: ``Counter`` mapping ``(word_a, word_b)`` tuples
            to co-occurrence counts.
        top_n: Number of strongest pairs to include as edges.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Merge both orientations of each pair, keeping the first-seen
    # orientation as the key so no ordering between words is required.
    merged_counts = Counter()
    for (w1, w2), count in cooccurrence_counts.items():
        key = (w2, w1) if (w2, w1) in merged_counts else (w1, w2)
        merged_counts[key] += count

    G = nx.Graph()
    for (w1, w2), count in merged_counts.most_common(top_n):
        G.add_edge(w1, w2, weight=count)

    pos = nx.spring_layout(G, k=1)
    plt.figure(figsize=(60, 60))
    nx.draw_networkx_nodes(G, pos, node_size=7000, node_color='lightblue')
    nx.draw_networkx_edges(G, pos, width=2)
    nx.draw_networkx_labels(G, pos, font_size=30, font_family='sans-serif')
    # Take the labels from the graph itself rather than re-ranking the
    # counter, so labels and edges cannot drift apart.
    edge_labels = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    plt.title('Co-occurrence Network')
    plt.show()
# Preprocess and tokenize the text.
# NOTE(review): `processed_text` is not defined in this section; presumably it
# is produced by an earlier step — confirm it exists before running this.
tokens = preprocess_text(processed_text)
# Build the co-occurrence counts (uses the default window size of 2).
cooccurrence_counts = build_cooccurrence_matrix(tokens)
# Plot the co-occurrence network.
plot_cooccurrence_network(cooccurrence_counts)