import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import pandas as pd

def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """
    word             : the word being tested
    counts_recent    : raw count in week 5
    counts_reference : raw count in weeks 1-4
    total_recent     : total tokens in week 5
    total_reference  : total tokens in weeks 1-4
    """
    a = counts_recent      # word in recent
    b = counts_reference   # word in reference
    c = total_recent - a   # non-word in recent
    d = total_reference - b  # non-word in reference

    contingency = np.array([[a, b],
                             [c, d]])

    # --- Chi-Squared ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-Likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected))
    # scipy's chi2_contingency with lambda_="log-likelihood" computes this
    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent    = a / total_recent
    freq_reference = b / total_reference

    pct_diff = (freq_recent - freq_reference) / freq_reference * 100

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word":           word,
        "freq_recent":    freq_recent,
        "freq_reference": freq_reference,
        "pct_diff":       pct_diff,
        "log_ratio":      log_ratio,
        "chi2":           chi2_stat,
        "p_chi2":         p_chi2,
        "g2":             g2_stat,
        "p_g2":           p_g2,
    }

# Example data
counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}
counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}
total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())

# Run tests on whole vocabulary, including correction for multiple tests
# (false discovery rate).

results = [
    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
                 total_recent, total_reference)
    for word in counts_recent]

# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")

for r, p_adj in zip(results, p_adjusted):
    r["p_g2_adjusted"] = p_adj

results = pd.DataFrame(results)
results

# Significant according to Chi2
results[results['p_chi2'] < 0.05]

# Significant according to G2 (LLR)
results[results['p_g2_adjusted'] < 0.05]

	word	freq_recent	freq_reference	pct_diff	log_ratio	chi2	p_chi2	g2	p_g2	p_g2_adjusted
0	eend	0.424929	0.241379	76.042088	0.815920	25.238117	5.067080e-07	24.764140	6.479173e-07	0.000002
1	tafel	0.286119	0.313480	-8.728045	-0.131756	0.598371	4.392004e-01	0.474701	4.908322e-01	0.490832
2	fiets	0.288952	0.445141	-35.087579	-0.623434	17.676782	2.618028e-05	17.051468	3.638025e-05	0.000055

	word	freq_recent	freq_reference	pct_diff	log_ratio	chi2	p_chi2	g2	p_g2	p_g2_adjusted
0	eend	0.424929	0.241379	76.042088	0.815920	25.238117	5.067080e-07	24.764140	6.479173e-07	0.000002
2	fiets	0.288952	0.445141	-35.087579	-0.623434	17.676782	2.618028e-05	17.051468	3.638025e-05	0.000055

	word	freq_recent	freq_reference	pct_diff	log_ratio	chi2	p_chi2	g2	p_g2	p_g2_adjusted
0	eend	0.424929	0.241379	76.042088	0.815920	25.238117	5.067080e-07	24.764140	6.479173e-07	0.000002
2	fiets	0.288952	0.445141	-35.087579	-0.623434	17.676782	2.618028e-05	17.051468	3.638025e-05	0.000055