分析コードのまとめ -N-gram 分析- - S-Linguistics

分析コードのまとめ -N-gram 分析-

投稿者: Sho オン 28/06/2024 20/06/2024 BLOG 日本語

分析コードをまとめていく．N-gram 分析編

前回に続いて，コードをまとめる．

今回は N-gram 分析にフォーカスして，コードをまとめる．

import nltk
from collections import Counter
import matplotlib.pyplot as plt
import re

# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK
# releases load it from the 'punkt' resource, while newer releases (3.8.2 and
# later) look for 'punkt_tab' instead, so fetch both names. Requesting a
# resource that an installed NLTK version does not know about only reports an
# error message; it does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')

def generate_ngrams(text, n):
    """
    Build the list of word n-grams found in ``text``, ignoring punctuation.

    Every character that is neither a word character nor whitespace is
    deleted before tokenising, so punctuation never becomes part of an
    n-gram.

    Args:
        text: The input string to analyse.
        n: Number of consecutive tokens in each n-gram.

    Returns:
        A list of the n-grams produced by ``nltk.ngrams`` over the tokens.
    """
    # Strip punctuation first so the tokenizer only sees words and spaces.
    without_punctuation = re.compile(r'[^\w\s]').sub('', text)
    words = nltk.word_tokenize(without_punctuation)
    return [*nltk.ngrams(words, n)]

def plot_ngram_frequencies(ngrams, n, top=10):
    """
    Plot a bar chart of the most frequent n-grams.

    Args:
        ngrams: Collection of n-grams, each an iterable of string tokens
            (the tokens are joined with spaces to form the tick labels).
        n: Size of the n-grams; used only in the messages and chart labels.
        top: Maximum number of distinct n-grams to show (default 10).

    Returns:
        None. When ``ngrams`` is empty, or when ``most_common(top)`` yields
        nothing (e.g. ``top=0``), a message is printed and no figure is
        created.
    """
    if not ngrams:
        print(f"No {n}-grams found in the text.")
        return

    common_ngrams = Counter(ngrams).most_common(top)

    if not common_ngrams:
        print(f"No common {n}-grams found in the text.")
        return

    # Use a separate name instead of rebinding the ``ngrams`` parameter.
    top_ngrams, counts = zip(*common_ngrams)
    labels = [' '.join(ngram) for ngram in top_ngrams]

    plt.figure(figsize=(10, 6))
    # most_common(top) returns fewer than ``top`` entries when the text has
    # fewer distinct n-grams, so place the bars by the number of entries
    # actually returned; range(top) would not match the length of ``counts``
    # in that case and the bar call would fail.
    plt.bar(range(len(counts)), counts, tick_label=labels)
    plt.xlabel(f'Top {top} {n}-grams')
    plt.ylabel('Frequency')
    plt.title(f'Top {top} Most Common {n}-grams')
    plt.xticks(rotation=45)
    plt.show()


# NOTE: processed_text is not defined in this snippet; it must already exist
# when these lines run.

# Bigrams (2-grams): generate them, then plot their frequencies.
bigrams = generate_ngrams(text=processed_text, n=2)
plot_ngram_frequencies(ngrams=bigrams, n=2)

# Trigrams (3-grams): generate them, then plot their frequencies.
trigrams = generate_ngrams(text=processed_text, n=3)
plot_ngram_frequencies(ngrams=trigrams, n=3)

関連

コメントを残すコメントをキャンセル