分析コードをまとめていく.統計検定編


前回に続いて,コードをまとめる.

今回は統計的な検定にフォーカスをして分析をする.

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.stats import shapiro, norm, gamma, lognorm, expon, chi2, kstest
from statsmodels.stats.diagnostic import acorr_ljungbox
import warnings

# 警告を無視
warnings.filterwarnings('ignore')

def create_binary_representation_df(text, target_word):
    """
    テキストの各単語に対して target_word なら 1, それ以外なら 0 を割り振り、DataFrameを作成する
    """
    tokens = tokenize(text)
    binary_representation = [1 if word == target_word else 0 for word in tokens]
    df_tg = pd.DataFrame({
        'Word_Index': range(len(tokens)),
        'Word': tokens,
        'binary': binary_representation
    })
    return df_tg

def perform_stat_tests(df_tg):
    """
    様々な統計検定を実施し、結果を出力する関数
    """
    results = {}

    # 正規性検定(シャピロ・ウィルク検定)
    stat, p = shapiro(df_tg['binary'])
    results['Shapiro-Wilk Test'] = (stat, p)
    if p > 0.05:
        print(f"Shapiro-Wilk Test: The distribution seems normal (fail to reject H0), p-value = {p}")
    else:
        print(f"Shapiro-Wilk Test: The distribution does NOT seem normal (reject H0), p-value = {p}")

    # ランダム性検定(Runs Test)
    n1 = sum(df_tg['binary'])
    n2 = len(df_tg) - n1
    runs = 1 + sum(df_tg['binary'].values[:-1] != df_tg['binary'].values[1:])
    expected_runs = (2 * n1 * n2) / (n1 + n2) + 1
    var_runs = (2 * n1 * n2 * (2 * n1 * n2 - n1 - n2)) / ((n1 + n2)**2 * (n1 + n2 - 1))
    z = (runs - expected_runs) / np.sqrt(var_runs)
    p = 2 * (1 - norm.cdf(abs(z)))  # 両側検定
    results['Runs Test'] = (z, p)
    if p > 0.05:
        print(f"Runs Test: The sequence appears random (fail to reject H0), p-value = {p}")
    else:
        print(f"Runs Test: The sequence does NOT appear random (reject H0), p-value = {p}")

    # Ljung-Box 検定
    lb_value, lb_pvalue = acorr_ljungbox(df_tg['binary'], lags=[10], return_df=True).iloc[0]
    results['Ljung-Box Test'] = (lb_value, lb_pvalue)
    if lb_pvalue > 0.05:
        print(f"Ljung-Box Test: The data does not show significant autocorrelation (fail to reject H0), p-value = {lb_pvalue}")
    else:
        print(f"Ljung-Box Test: The data shows significant autocorrelation (reject H0), p-value = {lb_pvalue}")

    # 主要な分布に従うかの検定
    distributions = {
        'Normal': norm,
        'Gamma': gamma,
        'LogNormal': lognorm,
        'Exponential': expon,
        'Chi-Square': chi2
    }

    for dist_name, dist in distributions.items():
        try:
            param = dist.fit(df_tg['binary'])
            stat, p = kstest(df_tg['binary'], dist.cdf, args=param)
            results[f'{dist_name} Distribution'] = (stat, p)
            if p > 0.05:
                print(f"{dist_name} Distribution: The data fits the distribution (fail to reject H0), p-value = {p}")
            else:
                print(f"{dist_name} Distribution: The data does NOT fit the distribution (reject H0), p-value = {p}")
        except Exception as e:
            results[f'{dist_name} Distribution'] = (None, None)
            print(f"{dist_name} Distribution: Error in fitting, {str(e)}")

    return results

# テキストのバイナリ表現を作成
df_tg = create_binary_representation_df(processed_text, target_word)

# 統計検定の実施
test_results = perform_stat_tests(df_tg)

# 結果の表示
results_df = pd.DataFrame.from_dict(test_results, orient='index', columns=['Statistic', 'p-value'])
print(results_df)

コメントを残す

メールアドレスが公開されることはありません。 が付いている欄は必須項目です