小説内の単語の長さ - S-Linguistics

投稿者: Sho オン 17/06/2024 14/06/2024 BLOG 日本語

Alice in Wonderland における共起語

先日からAlice in Wonderland の分析を行なっている．

今回は共起語ネットワークの作成をしてみた．

以下に実行したコードと結果を示す．（前回のコードの続きであるという前提）

import re
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from collections import Counter
from nltk.corpus import stopwords
import nltk

# Download the NLTK stopword corpus (read later through nltk.corpus.stopwords)
nltk.download('stopwords')



def preprocess_text(text):
    """Normalize raw text for tokenization.

    The text is lowercased, then every character that is neither a word
    character (``\\w``) nor whitespace (``\\s``) is removed. Whitespace,
    including newlines, is left untouched.

    Args:
        text: The input string.

    Returns:
        The lowercased string with punctuation stripped.
    """
    lowered = text.lower()
    # Drop anything that is not a word character or whitespace.
    return re.sub(r'[^\w\s]', '', lowered)

def get_words(text):
    """Split text into a list of normalized words.

    The text is first passed through ``preprocess_text`` and the result is
    split on runs of whitespace.

    Args:
        text: The input string.

    Returns:
        A list of word strings in their original order.
    """
    return preprocess_text(text).split()

def remove_stopwords(words):
    """Filter out English stopwords.

    Args:
        words: An iterable of word strings.

    Returns:
        A new list holding the words that are not in NLTK's English
        stopword list, in their original order.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = []
    for token in words:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

def create_cooccurrence_network(words, target_words, window_size=2):
    """Build an undirected co-occurrence graph over the target words.

    Two target words co-occur when they appear at different positions in
    ``words`` that are at most ``window_size`` apart. The weight of the edge
    between them is the number of such position pairs.

    Args:
        words: Sequence of tokens, in text order.
        target_words: Iterable of the (hashable) words to include in the
            network; all other tokens are ignored.
        window_size: Maximum distance, in tokens, between two co-occurring
            words.

    Returns:
        A ``networkx.Graph`` whose edges carry a ``weight`` attribute. Target
        words that never co-occur with another target word are not added as
        nodes.
    """
    # Build the lookup set once: membership tests inside the nested loop
    # would otherwise scan the target list linearly each time.
    targets = set(target_words)
    total = len(words)

    # Only look forward from each position so that every pair of positions
    # is counted exactly once. A symmetric window visits each pair twice,
    # which doubles the weight of self-loops (a word next to itself)
    # relative to every other edge.
    pair_counts = Counter()
    for i, word in enumerate(words):
        if word not in targets:
            continue
        for j in range(i + 1, min(total, i + window_size + 1)):
            neighbour = words[j]
            if neighbour in targets:
                pair_counts[(word, neighbour)] += 1

    G = nx.Graph()
    for (word1, word2), count in pair_counts.items():
        # (a, b) and (b, a) are the same undirected edge: accumulate rather
        # than overwrite so both orientations contribute to the weight.
        if G.has_edge(word1, word2):
            G[word1][word2]['weight'] += count
        else:
            G.add_edge(word1, word2, weight=count)

    return G

def get_top_n_words(words, n=50):
    """Return the ``n`` most frequent words.

    Args:
        words: An iterable of word strings.
        n: How many words to return.

    Returns:
        A list of words ordered from most to least frequent, as ranked by
        ``Counter.most_common``; the counts themselves are discarded.
    """
    ranked = Counter(words).most_common(n)
    return [entry[0] for entry in ranked]

# Tokenize the full text.
# NOTE(review): full_text is not defined in this snippet — presumably it is
# produced by the previous post's code; confirm before running standalone.
words = get_words(full_text)

# Drop stopwords
filtered_words = remove_stopwords(words)

# Take the 50 most frequent words
top_50_words = get_top_n_words(filtered_words, 50)

# Build the co-occurrence network
G = create_cooccurrence_network(filtered_words, top_50_words)

def plot_cooccurrence_network(G):
    """Draw the co-occurrence graph with matplotlib and show the figure.

    Nodes are positioned with ``nx.spring_layout`` and labelled with their
    names; each edge is labelled with its ``weight`` attribute.

    Args:
        G: The graph to draw.
    """
    plt.figure(figsize=(12, 10))
    layout = nx.spring_layout(G)

    node_style = dict(
        with_labels=True,
        node_color='lightblue',
        edge_color='gray',
        node_size=3000,
        font_size=15,
        font_weight='bold',
    )
    nx.draw(G, layout, **node_style)

    # Annotate every edge with its co-occurrence weight.
    weights = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(G, layout, edge_labels=weights)

    plt.title("Co-occurrence Network (Top 50 Words Excluding Stopwords)")
    plt.show()

# Draw the plot
plot_cooccurrence_network(G)

参考文献

https://www.gutenberg.org/files/11/11-h/11-h.htm

関連

コメントを残すコメントをキャンセル