分析コードのまとめ -個別単語の可視化- - S-Linguistics

分析コードのまとめ -個別単語の可視化-

投稿者: Sho オン 25/06/2024 18/06/2024 BLOG 日本語

分析コードをまとめていく．個別単語の可視化編

前回に続いて，コードをまとめる．

今回は，個別単語の基本情報の可視化にフォーカスして分析する．

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def plot_sentence_word_counts(sentence_word_counts, word=None):
    """
    Plot a bar chart of the word's occurrence count in each sentence.

    Args:
        sentence_word_counts: Sequence of per-sentence counts; element ``i``
            is drawn as the bar for sentence number ``i + 1``.
        word: Word shown in the chart title. When omitted, the module-level
            ``target_word`` is used, which keeps existing one-argument calls
            working unchanged.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Only touch the global when the caller did not supply a word, so the
    # function is usable from code that has no ``target_word`` global.
    shown_word = target_word if word is None else word
    sentence_numbers = range(1, len(sentence_word_counts) + 1)

    plt.figure(figsize=(10, 5))
    plt.bar(sentence_numbers, sentence_word_counts)
    plt.xlabel('Sentence Number')
    plt.ylabel('Word Count')
    plt.title(f'Word Count per Sentence for "{shown_word}"')
    plt.show()

def _build_position_matrix(sentence_positions, sentences):
    """
    Build the 0/1 matrix (sentence x word position) drawn by the heatmap.

    Returns ``None`` when there is nothing to draw, i.e. ``sentences`` is
    empty or every sentence splits into zero words.
    """
    sentence_lengths = [len(sentence.split()) for sentence in sentences]
    max_length = max(sentence_lengths, default=0)
    if max_length == 0:
        return None

    matrix = np.zeros((len(sentences), max_length))
    # zip() stops at the shorter input, so position lists that have no
    # matching sentence are ignored (same as the original bounds check).
    for row, positions in zip(matrix, sentence_positions):
        for pos in positions:
            # Positions are treated as 1-based (hence ``pos - 1``). The lower
            # bound keeps pos < 1 from wrapping to a negative index and
            # marking a cell at the end of the row.
            if 1 <= pos <= max_length:
                row[pos - 1] = 1
    return matrix


def plot_word_positions(sentence_positions, sentences, word=None):
    """
    Plot a heatmap of where the word occurs inside each sentence.

    Args:
        sentence_positions: Iterable with one collection of positions per
            sentence. Positions are interpreted as 1-based word indexes;
            values outside ``1..max sentence length`` are skipped.
        sentences: Sentences as strings; their lengths are measured with
            ``str.split()`` and the longest one fixes the heatmap width.
        word: Word shown in the chart title. When omitted, the module-level
            ``target_word`` is used, which keeps existing two-argument calls
            working unchanged.

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn
        when there are no sentences or all sentences are empty.
    """
    heatmap_data = _build_position_matrix(sentence_positions, sentences)
    if heatmap_data is None:
        # Nothing to visualise; the original raised ValueError from max([]).
        return

    shown_word = target_word if word is None else word

    plt.figure(figsize=(10, 15))
    sns.heatmap(
        heatmap_data,
        cmap='Blues',
        cbar=False,
        xticklabels=False,
        yticklabels=range(1, len(sentences) + 1),
    )
    plt.xlabel('Word Position in Sentence')
    plt.ylabel('Sentence Number')
    plt.title(f'Word Positions in Sentences for "{shown_word}"')
    plt.show()

def plot_sentence_lengths(sentences_with_target_word, word=None):
    """
    Plot a histogram of the lengths of the sentences that contain the word.

    Args:
        sentences_with_target_word: Sentences as strings; each length is the
            number of whitespace-separated tokens from ``str.split()``.
        word: Word shown in the chart title. When omitted, the module-level
            ``target_word`` is used, which keeps existing one-argument calls
            working unchanged.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Only touch the global when the caller did not supply a word.
    shown_word = target_word if word is None else word
    sentence_lengths = [
        len(sentence.split()) for sentence in sentences_with_target_word
    ]

    plt.figure(figsize=(10, 5))
    plt.hist(sentence_lengths, bins=20, edgecolor='black')
    plt.xlabel('Number of Words in Sentence')
    plt.ylabel('Frequency')
    plt.title(
        f'Sentence Length Distribution for Sentences Containing "{shown_word}"'
    )
    plt.show()

def plot_cumulative_word_frequency(word_freq):
    """
    Plot the cumulative frequency distribution of words.

    Counts are sorted from most to least frequent and accumulated; the curve
    shows which percentage of all occurrences is covered by the top-k words.

    Args:
        word_freq: Mapping whose values are the occurrence counts of words
            (only ``word_freq.values()`` is read).

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn
        when ``word_freq`` is empty or its counts sum to zero.
    """
    word_counts = sorted(word_freq.values(), reverse=True)
    if not word_counts:
        # The original failed with IndexError on cumulative_counts[-1].
        return

    cumulative_counts = np.cumsum(word_counts)
    total = cumulative_counts[-1]
    if total == 0:
        # Avoid a division by zero that would plot a line of NaN values.
        return
    cumulative_percentage = cumulative_counts / total * 100

    # Ranks start at 1 (the most frequent word is rank 1), matching the axis
    # label and the 1-based numbering used by the other plots in this file.
    ranks = range(1, len(word_counts) + 1)

    plt.figure(figsize=(10, 5))
    plt.plot(ranks, cumulative_percentage)
    plt.xlabel('Rank of Words')
    plt.ylabel('Cumulative Frequency (%)')
    plt.title('Cumulative Word Frequency Distribution')
    plt.show()

def plot_line_sentence_word_counts(sentence_word_counts, word=None):
    """
    Plot a line graph of the word's occurrence count in each sentence.

    Args:
        sentence_word_counts: Sequence of per-sentence counts; element ``i``
            is plotted at sentence number ``i + 1``.
        word: Word shown in the chart title. When omitted, the module-level
            ``target_word`` is used, which keeps existing one-argument calls
            working unchanged.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Only touch the global when the caller did not supply a word.
    shown_word = target_word if word is None else word
    sentence_numbers = range(1, len(sentence_word_counts) + 1)

    plt.figure(figsize=(10, 5))
    plt.plot(sentence_numbers, sentence_word_counts, marker='o')
    plt.xlabel('Sentence Number')
    plt.ylabel('Word Count')
    plt.title(f'Word Count per Sentence (Line Graph) for "{shown_word}"')
    plt.show()

def plot_word_length_distribution(word_lengths):
    """
    Draw a 20-bin histogram of the given word lengths and display it.

    Args:
        word_lengths: Collection of word lengths passed to ``plt.hist``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    histogram_options = {'bins': 20, 'edgecolor': 'black'}

    plt.figure(figsize=(10, 5))
    plt.hist(word_lengths, **histogram_options)

    # Labelling calls are independent of one another.
    plt.title('Word Length Distribution')
    plt.xlabel('Word Length')
    plt.ylabel('Frequency')

    plt.show()


# Run the analysis on the preprocessed text data.
# NOTE(review): analyze_target_word, display_target_word_analysis, tokenize,
# processed_text, target_word and stats are not defined in this listing —
# presumably they come from the code of the previous post; confirm they are
# in scope before running this section.
analysis = analyze_target_word(processed_text, target_word)

# Display the analysis results.
display_target_word_analysis(analysis, target_word)

# Bar chart: occurrence count of the word per sentence.
plot_sentence_word_counts(analysis['sentence_word_counts'])

# Heatmap: positions of the word within each sentence.
plot_word_positions(analysis['sentence_positions'], analysis['sentences_with_target_word'])

# Histogram: length distribution of the sentences containing the word.
plot_sentence_lengths(analysis['sentences_with_target_word'])

# Cumulative frequency distribution of words.
plot_cumulative_word_frequency(stats["word_freq"])

# Line graph: occurrence count of the word per sentence.
plot_line_sentence_word_counts(analysis['sentence_word_counts'])

# Histogram: distribution of word lengths over all tokens of the text.
word_lengths = [len(word) for word in tokenize(processed_text)]
plot_word_length_distribution(word_lengths)

関連

コメントを残すコメントをキャンセル