単語分布の実験

Alice in Wonderland で実験してみる．

昨日の投稿でTanaka (2021) の単語分布の興味深い研究を紹介した．これに触発されて私も調査してみた．

Tanaka (2021) はメルヴィルの「白鯨」を題材としていたが，私は別のテキストとして，キャロルの「不思議の国のアリス」を題材とした．

結論として，Tanaka (2021) と同様に「塊現象」らしきものが発生した．

以下に実行したコードと結果を示す．

import re
import unicodedata

def fullwidth_to_halfwidth(full_text):
    """
    Convert full-width characters to their half-width equivalents.

    The table covers full-width Latin letters, digits, and a fixed set of
    punctuation and symbols. Curly quotation marks become straight quotes
    and the angle brackets 〈〉 become < >. The brackets 《》「」『』【】 are
    mapped to themselves, i.e. left unchanged. Any character not listed in
    the table passes through untouched.

    Args:
        full_text: The string to convert.

    Returns:
        A new string with every listed character replaced.
    """
    fullwidth_chars = (
        'ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ'
        # The lowercase run must start with full-width 'ａ'. A stray 'Ａ' here
        # would override the uppercase mapping above (later duplicate keys
        # win in str.maketrans) and leave 'ａ' unconverted.
        'ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ'
        '０１２３４５６７８９'
        '，．！？：；”“’‘（）［］｛｝〈〉《》「」『』【】＋－＝'
        '￥％＃＠＆＊'
    )
    halfwidth_chars = (
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        'abcdefghijklmnopqrstuvwxyz'
        '0123456789'
        ',.!?:;""\'\'()[]{}<>《》「」『』【】+-='
        '¥%#@&*'
    )
    # Both strings must have the same length; position i maps to position i.
    return full_text.translate(str.maketrans(fullwidth_chars, halfwidth_chars))

def preprocess_text(text):
    """
    Reduce raw text to lowercase words separated by single spaces.

    Steps, in order:
      1. NFKC-normalize the text.
      2. Convert full-width characters to half-width.
      3. Lowercase everything.
      4. Turn line breaks and tabs into spaces.
      5. Delete every character that is not an ASCII letter, a digit,
         whitespace, or one of ``, . ! ?``.
      6. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    # Normalize first, while compatibility characters (ligatures, full-width
    # forms, ...) are still present. After the ASCII-only filter below there
    # is nothing left for NFKC to change, so running it last has no effect.
    text = unicodedata.normalize('NFKC', text)

    # Convert full-width characters to half-width
    text = fullwidth_to_halfwidth(text)

    # Lowercase
    text = text.lower()

    # Replace line breaks and tabs with spaces
    text = re.sub(r'[\r\n\t]', ' ', text)
    # Drop everything except letters, digits, whitespace and basic
    # punctuation. Backslashes are not in the allowed set, so they are
    # removed here too and need no separate pass.
    text = re.sub(r'[^a-zA-Z0-9\s,.!?]', '', text)
    # Collapse runs of whitespace into a single space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def save_text_to_file(text, filename):
    """
    Write text to a file as UTF-8, replacing any existing content.

    Args:
        text: The string to write.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode='w', encoding='utf-8') as out:
        out.write(text)

# Source text. The literal below is only a placeholder: paste the text to
# analyze in its place (the literal notes the URL the text was taken from).
full_text = """

ここにテキストを使用
使用したテキストは下記から引用
https://www.gutenberg.org/files/11/11-h/11-h.htm

"""
# Clean the text, then keep a copy of the cleaned version on disk.
full_text = preprocess_text(full_text)
save_text_to_file(full_text, 'full_text.txt')

# Display the result
print(full_text)

import pandas as pd
import matplotlib.pyplot as plt

    

def word_occurrences_df(full_text, target_word):
    """
    Build a per-token 0/1 indicator table for one word.

    The text is split on whitespace. A token counts as an occurrence when it
    equals ``target_word`` exactly, or when it equals ``target_word`` after
    punctuation is stripped from both of its ends (so "the," and "the." count
    as "the").

    Args:
        full_text: The text to scan.
        target_word: The word to look for.

    Returns:
        A pandas DataFrame with one row per token and two columns:
        'Word_Index' (0-based token position) and 'Occurrence' (1 if the
        token matches, otherwise 0). Empty text yields an empty frame with
        the same columns.
    """
    import string  # function-scope import: only this function needs it

    # Split the text into whitespace-separated tokens
    tokens = full_text.split()

    # Punctuation stays glued to tokens after whitespace splitting, so a bare
    # equality test would miss "word," / "word!" and undercount. The exact
    # comparison is kept first so that anything matched before still matches.
    edge_punctuation = string.punctuation
    occurrences = [
        int(token == target_word
            or token.strip(edge_punctuation) == target_word)
        for token in tokens
    ]

    # Convert to a data frame
    return pd.DataFrame({
        'Word_Index': range(len(occurrences)),
        'Occurrence': occurrences,
    })


# Build the occurrence data frame for a specific word
df = word_occurrences_df(full_text, "the")

# Show the first rows of the data frame
print(df.head())

# Visualize the occurrence data frame as a series over word positions
def plot_word_occurrences_ts(df, target_word):
    """
    Plot where a word occurs along the text.

    Opens a figure with figsize (36, 4), draws one '|' marker per row of
    ``df`` at (Word_Index, Occurrence) with no connecting line, labels the
    plot, and shows it.

    Args:
        df: Data frame with 'Word_Index' and 'Occurrence' columns.
        target_word: The word named in the title and the y-axis label.
    """
    plt.figure(figsize=(36, 4))

    positions = df['Word_Index']
    flags = df['Occurrence']
    # Markers only: the points are not joined into a line.
    plt.plot(positions, flags, linestyle='None', marker='|')

    heading = f"Occurrences of the word '{target_word}' in the full_text (Time Series)"
    plt.title(heading)
    plt.xlabel("Word index")
    y_caption = f"Occurrence of '{target_word}' (1 if present, 0 otherwise)"
    plt.ylabel(y_caption)

    plt.show()

# Draw the occurrence plot for the same word used to build df
plot_word_occurrences_ts(df, "the")

参考文献

田中久美子 (2021)『言語とフラクタル-使用の集積の中にある偶然と必然-』東京大学出版会.
Lewis Carroll『Alice's Adventures in Wonderland』Project Gutenberg. https://www.gutenberg.org/files/11/11-h/11-h.htm

いいね:

関連

コメントを残すコメントをキャンセル

共有:

いいね:

関連

コメントを残す コメントをキャンセル

コメントを残すコメントをキャンセル