分析コードのまとめ -共起語ネットワーク- - S-Linguistics

分析コードのまとめ -共起語ネットワーク-

投稿者: Sho オン 29/06/2024 20/06/2024 BLOG 日本語

分析コードをまとめていく．共起語ネットワーク編

前回に続いて，コードをまとめる．

今回は共起語ネットワークにフォーカスして分析する．

import nltk
import re
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter

# Download NLTK's 'punkt' resource at import time.
# NOTE(review): presumably this is the tokenizer data that nltk.word_tokenize
# (used in preprocess_text) relies on -- confirm the resource name for the
# installed NLTK version.
nltk.download('punkt')

def preprocess_text(text):
    """Remove punctuation from ``text`` and split it into tokens.

    Every character that is neither a regex word character nor whitespace
    is deleted first; the cleaned string is then handed to
    ``nltk.word_tokenize``.

    Args:
        text: The raw input string.

    Returns:
        The tokens produced by ``nltk.word_tokenize`` for the cleaned text.
    """
    punctuation_free = re.sub(r'[^\w\s]', '', text)
    return nltk.word_tokenize(punctuation_free)

def build_cooccurrence_matrix(tokens, window_size=2):
    """Count how often two different tokens appear close to each other.

    Two token positions co-occur when they are fewer than ``window_size``
    positions apart, i.e. when both fit into one window of ``window_size``
    consecutive tokens. Each such pair of positions is counted exactly
    once, regardless of how many overlapping windows contain it, and
    inputs shorter than ``window_size`` still contribute their pairs.

    Args:
        tokens: Sliceable sequence of hashable tokens (e.g. a list of str).
        window_size: Number of consecutive tokens that form one window.
            Values below 2 cannot contain a pair and yield an empty result.

    Returns:
        A ``Counter`` mapping ``(earlier_token, later_token)`` to the number
        of co-occurrences. Keys follow text order, so ``(a, b)`` and
        ``(b, a)`` are distinct keys. Pairs of identical tokens are skipped.
    """
    cooccurrence_counts = Counter()
    if window_size < 2:
        # A window of fewer than two tokens cannot hold a pair.
        return cooccurrence_counts

    for i, w1 in enumerate(tokens):
        # Pair the token only with the followers inside its window; this
        # visits each position pair once instead of once per sliding window.
        for w2 in tokens[i + 1:i + window_size]:
            if w1 != w2:  # skip pairs made of the same word
                cooccurrence_counts[(w1, w2)] += 1
    return cooccurrence_counts

def plot_cooccurrence_network(cooccurrence_counts, top_n=30):
    """Draw the most frequent co-occurrence pairs as an undirected network.

    The ``top_n`` most common pairs become edges of an undirected graph
    whose edge weight is the co-occurrence count; the graph is laid out
    with a spring layout and shown with matplotlib.

    Args:
        cooccurrence_counts: A ``collections.Counter`` mapping
            ``(word1, word2)`` tuples to counts (``most_common`` is used).
        top_n: How many of the most frequent pairs to draw.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    G = nx.Graph()
    for (w1, w2), count in cooccurrence_counts.most_common(top_n):
        if G.has_edge(w1, w2):
            # The graph is undirected, so (a, b) and (b, a) are the same
            # edge: add the counts up instead of overwriting the weight.
            G[w1][w2]['weight'] += count
        else:
            G.add_edge(w1, w2, weight=count)

    pos = nx.spring_layout(G, k=1)
    plt.figure(figsize=(60, 60))
    nx.draw_networkx_nodes(G, pos, node_size=7000, node_color='lightblue')
    nx.draw_networkx_edges(G, pos, width=2)
    nx.draw_networkx_labels(G, pos, font_size=30, font_family='sans-serif')

    # Read the labels back from the graph so every edge gets exactly one
    # label that matches its (possibly merged) weight.
    edge_labels = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

    plt.title('Co-occurrence Network')
    plt.show()


# Preprocess the text and tokenize it.
# NOTE(review): `processed_text` is not defined anywhere in this snippet;
# presumably it is produced by the code from the previous post -- confirm
# before running this on its own.
tokens = preprocess_text(processed_text)

# Build the co-occurrence counts from the tokens.
cooccurrence_counts = build_cooccurrence_matrix(tokens)

# Plot the co-occurrence network.
plot_cooccurrence_network(cooccurrence_counts)

関連

コメントを残す / コメントをキャンセル