分析コードのまとめ -個別単語の時系列可視化- - S-Linguistics

分析コードのまとめ -個別単語の時系列可視化-

投稿者: Sho オン 26/06/2024 19/06/2024 BLOG 日本語

分析コードをまとめていく．個別単語の時系列可視化編

前回に続いて，コードをまとめる．

今回は個別単語の時系列可視化にフォーカスして分析する．なお，以下のコードで使用する tokenize，processed_text，target_word は，あらかじめ定義されているものとする．

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose

def create_binary_representation_df(text, target_word):
    """
    Build a per-token indicator table for ``target_word``.

    ``text`` is split with ``tokenize``; every resulting token is flagged
    with 1 when it equals ``target_word`` and 0 otherwise.

    Returns a DataFrame with three columns: ``Word_Index`` (0-based token
    position), ``Word`` (the token itself) and ``binary`` (the 0/1 flag).
    """
    words = tokenize(text)
    # A comparison yields a bool; int() maps True/False to 1/0.
    flags = [int(word == target_word) for word in words]
    columns = {
        'Word_Index': range(len(words)),
        'Word': words,
        'binary': flags,
    }
    return pd.DataFrame(columns)

def plot_time_series_analysis(df_tg, target_word):
    """
    Show a stem plot of the 0/1 occurrence series over the token positions.

    Reads the ``Word_Index`` and ``binary`` columns of ``df_tg``;
    ``target_word`` is only used in the axis label and the title.
    The figure is displayed with ``plt.show()`` and nothing is returned.
    """
    y_label = f'Occurrence of "{target_word}" (1 if present, 0 otherwise)'
    heading = f'Occurrences of the word "{target_word}" in the full text (Time Series)'

    # Very wide, short canvas: one stem per token position.
    _, axes = plt.subplots(figsize=(36, 2))
    axes.stem(df_tg['Word_Index'], df_tg['binary'], basefmt=" ")
    axes.set_xlabel('Word Index')
    axes.set_ylabel(y_label)
    axes.set_title(heading)
    plt.show()

def plot_acf_analysis(df_tg):
    """
    Plot the autocorrelation function (ACF) of the 0/1 occurrence series.

    Args:
        df_tg: DataFrame with a ``binary`` column holding the series.

    The plot is requested with ``lags=50`` and displayed with
    ``plt.show()``; nothing is returned.
    """
    # plot_acf opens a figure of its own unless it is handed an Axes. Calling
    # plt.figure() first therefore left a blank 15x6 figure and drew the ACF
    # at the default size. Creating the Axes here makes figsize take effect.
    _, ax = plt.subplots(figsize=(15, 6))
    plot_acf(df_tg['binary'], lags=50, ax=ax)
    # Set the labels after plot_acf so they override its default title.
    ax.set_title('Autocorrelation Function (ACF)')
    ax.set_xlabel('Lag')
    ax.set_ylabel('Autocorrelation')
    plt.show()

def plot_pacf_analysis(df_tg):
    """
    Plot the partial autocorrelation function (PACF) of the 0/1 occurrence series.

    Args:
        df_tg: DataFrame with a ``binary`` column holding the series.

    The plot is requested with ``lags=50`` and displayed with
    ``plt.show()``; nothing is returned.
    """
    # plot_pacf opens a figure of its own unless it is handed an Axes. Calling
    # plt.figure() first therefore left a blank 15x6 figure and drew the PACF
    # at the default size. Creating the Axes here makes figsize take effect.
    _, ax = plt.subplots(figsize=(15, 6))
    plot_pacf(df_tg['binary'], lags=50, ax=ax)
    # Set the labels after plot_pacf so they override its default title.
    ax.set_title('Partial Autocorrelation Function (PACF)')
    ax.set_xlabel('Lag')
    ax.set_ylabel('Partial Autocorrelation')
    plt.show()

def plot_moving_average(df_tg, window_size, target_word=None):
    """
    Plot the occurrence series together with its rolling mean.

    Args:
        df_tg: DataFrame with ``Word_Index`` and ``binary`` columns. A
            ``Moving_Average`` column is written into it in place.
        window_size: Window length passed to ``Series.rolling``.
        target_word: Word shown in the title and y-axis label. When omitted,
            the module-level ``target_word`` is used, which is what this
            function implicitly relied on before the parameter existed.

    Raises:
        NameError: If ``target_word`` is not passed and no module-level
            ``target_word`` is defined.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    if target_word is None:
        # Existing call sites pass only (df_tg, window_size); keep them
        # working by falling back to the global they depended on, but fail
        # up front with an explicit message rather than midway through
        # drawing.
        try:
            target_word = globals()['target_word']
        except KeyError:
            raise NameError(
                "plot_moving_average needs target_word: pass it as an "
                "argument or define a module-level 'target_word'"
            ) from None

    # The column is kept on the caller's DataFrame, as before.
    df_tg['Moving_Average'] = df_tg['binary'].rolling(window=window_size).mean()
    plt.figure(figsize=(36, 4))
    # Raw 0/1 values as tick marks, with no connecting line.
    plt.plot(df_tg['Word_Index'], df_tg['binary'], linestyle='None', marker='|', markersize=10, label='Occurrences')
    plt.plot(df_tg['Word_Index'], df_tg['Moving_Average'], color='orange', label=f'Moving Average (window={window_size})')
    plt.title(f"Moving Average of the word '{target_word}' in the full_text (Time Series)")
    plt.xlabel("Word index")
    plt.ylabel(f"Occurrence of '{target_word}' (1 if present, 0 otherwise)")
    plt.legend()
    plt.show()

def decompose_time_series(df_tg, target_word, period=1):
    """
    Run an additive seasonal decomposition on the cumulative occurrence count.

    Args:
        df_tg: DataFrame with a ``binary`` column. A ``binary_cumsum`` column
            (running total of ``binary``) is written into it in place.
        target_word: Not used by the body; kept so existing callers that
            pass it keep working.
        period: Period handed to ``seasonal_decompose``. Defaults to 1, the
            value that was previously hard-coded.

    The decomposition plot is displayed with ``plt.show()``; nothing is
    returned.
    """
    df_tg['binary_cumsum'] = df_tg['binary'].cumsum()
    # NOTE(review): with period=1 the seasonal component is presumably
    # trivial; pass a larger period to look for periodic structure.
    result = seasonal_decompose(df_tg['binary_cumsum'], model='additive', period=period)
    result.plot()
    plt.show()


# Build the 0/1 occurrence table for target_word from the text.
# processed_text and target_word are not defined in this section and must
# already exist when this runs.
df_tg = create_binary_representation_df(processed_text, target_word)

# Plot the occurrence time series
plot_time_series_analysis(df_tg, target_word)

# Plot the autocorrelation function
plot_acf_analysis(df_tg)

# Plot the partial autocorrelation function
plot_pacf_analysis(df_tg)

# Plot the moving average (window size 10 is used as an example)
plot_moving_average(df_tg, window_size=10)

# Plot the time-series decomposition
decompose_time_series(df_tg, target_word)

関連

コメントを残すコメントをキャンセル