分析コードのまとめ -基本統計量可視化- - S-Linguistics

分析コードのまとめ -基本統計量可視化-

投稿者: Sho オン 22/06/2024 18/06/2024 BLOG 日本語

分析コードをまとめていく．基本統計量可視化編

前回に続いて，コードをまとめる．

今回はテキストデータの基本統計量を可視化する．

# Download the NLTK resources needed for tokenizing and POS tagging
# (only needs to succeed once per environment).
import nltk

# NLTK 3.9+ looks up 'punkt_tab' and 'averaged_perceptron_tagger_eng', while
# older releases use the names without the suffix. Request both sets so the
# snippet works on either; by default nltk.download() reports a package it
# cannot fetch instead of raising, so the extra names are harmless.
for _resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import numpy as np


def plot_word_frequency(word_freq, top_n=20):
    """
    Plot the most frequent words as a bar chart.

    Args:
        word_freq: collections.Counter (or any object with a compatible
            most_common()) mapping each word to its number of occurrences.
        top_n: Maximum number of words to show, most frequent first.
            Defaults to 20.

    Returns:
        None. The figure is displayed with plt.show(); nothing is drawn
        when there are no words to plot.
    """
    most_common_words = word_freq.most_common(top_n)
    if not most_common_words:
        # zip(*[]) yields nothing, so the two-name unpacking below would
        # raise ValueError on an empty counter.
        return

    words, counts = zip(*most_common_words)

    plt.figure(figsize=(10, 5))
    plt.bar(words, counts)
    plt.xticks(rotation=45)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title(f'Top {top_n} Words Frequency')
    plt.show()

def plot_sentence_lengths(sentence_lengths):
    """
    Show a 20-bin histogram of sentence lengths.

    Args:
        sentence_lengths: Sequence of numbers, one per sentence; the x-axis
            label presents each value as a number of words.
    """
    _, axes = plt.subplots(figsize=(10, 5))
    axes.hist(sentence_lengths, bins=20, edgecolor='black')
    axes.set(
        xlabel='Number of Words',
        ylabel='Number of Sentences',
        title='Sentence Length Distribution',
    )
    plt.show()

def plot_word_length_distribution(word_lengths, bins=None):
    """
    Plot a histogram of word lengths.

    Args:
        word_lengths: Iterable of word lengths (the caller in this file
            passes len(word) for each token).
        bins: Forwarded to plt.hist as ``bins``. When None (the default),
            one unit-wide bin is centred on every integer between the
            shortest and the longest length. Pass an int such as 20 to get
            that many equal-width bins instead.

    Returns:
        None. The figure is displayed with plt.show(); nothing is drawn
        when word_lengths is empty.
    """
    word_lengths = list(word_lengths)
    if not word_lengths:
        # min()/max() below need at least one value.
        return

    if bins is None:
        # Lengths are integers: with a fixed bin count the bin width is
        # rarely a whole number, so some bins cover two lengths and others
        # none, which shows up as artificial spikes and gaps. Use edges at
        # k - 0.5 so every integer length gets exactly one bin.
        bins = np.arange(min(word_lengths), max(word_lengths) + 2) - 0.5

    plt.figure(figsize=(10, 5))
    plt.hist(word_lengths, bins=bins, edgecolor='black')
    plt.xlabel('Word Length')
    plt.ylabel('Frequency')
    plt.title('Word Length Distribution')
    plt.show()

def plot_cumulative_word_frequency(word_freq):
    """
    Plot the cumulative share of all occurrences covered by the top-ranked words.

    Words are ordered from most to least frequent; the curve at rank r gives
    the percentage of all occurrences accounted for by the r most frequent
    words.

    Args:
        word_freq: Mapping from word to occurrence count (e.g. a
            collections.Counter).

    Returns:
        None. The figure is displayed with plt.show(); nothing is drawn
        when the counts are empty or sum to zero.
    """
    word_counts = sorted(word_freq.values(), reverse=True)
    total = sum(word_counts)
    if total == 0:
        # Empty input would fail when reading the last cumulative value,
        # and a zero total would divide by zero.
        return

    cumulative_percentage = np.cumsum(word_counts) / total * 100
    # Ranks are 1-based; plotting the array on its own would start the
    # 'Rank of Words' axis at 0.
    ranks = np.arange(1, len(word_counts) + 1)

    plt.figure(figsize=(10, 5))
    plt.plot(ranks, cumulative_percentage)
    plt.xlabel('Rank of Words')
    plt.ylabel('Cumulative Frequency (%)')
    plt.title('Cumulative Word Frequency Distribution')
    plt.show()

def plot_pos_tag_frequency(text):
    """
    Plot how often each part-of-speech tag occurs in a text.

    The text is tokenized with nltk.word_tokenize and tagged with
    nltk.pos_tag; the tags are then counted and shown as a bar chart,
    most frequent tag first.

    Args:
        text: The text to analyse, as a single string.

    Returns:
        None. The figure is displayed with plt.show(); nothing is drawn
        when the text is empty or blank.
    """
    if not text or not text.strip():
        # Blank text has no tokens; skip the tokenizer and tagger entirely.
        return

    tokens = nltk.word_tokenize(text)
    pos_counts = Counter(tag for _, tag in nltk.pos_tag(tokens))
    if not pos_counts:
        # zip(*[]) yields nothing, so the unpacking below would raise
        # ValueError if no tags were produced.
        return

    # most_common() sorts the bars by descending frequency, matching the
    # ordering used by the word-frequency chart.
    tags, counts = zip(*pos_counts.most_common())

    plt.figure(figsize=(10, 5))
    plt.bar(tags, counts)
    plt.xticks(rotation=45)
    plt.xlabel('POS Tags')
    plt.ylabel('Frequency')
    plt.title('POS Tag Frequency Distribution')
    plt.show()


# NOTE(review): stats, tokenize and processed_text are not defined in this
# snippet; presumably they come from the previous post's code - confirm.

# Bar chart of the most frequent words
plot_word_frequency(stats["word_freq"])

# Histogram of sentence lengths
plot_sentence_lengths(stats["sentence_lengths"])

# Histogram of word lengths
word_lengths = list(map(len, tokenize(processed_text)))
plot_word_length_distribution(word_lengths)

# Cumulative word frequency curve
plot_cumulative_word_frequency(stats["word_freq"])

# Bar chart of part-of-speech tag counts
plot_pos_tag_frequency(processed_text)

関連

コメントを残すコメントをキャンセル