From ce8ed07327a3a28ae1e875444f23d1d48c4d08dd Mon Sep 17 00:00:00 2001
From: Peter Kleiweg <kleiweg@ziggo.nl>
Date: Sat, 20 Jun 2026 12:41:48 +0200
Subject: [PATCH] update stijgers2json.py

---
 collect.sh              | 20 ++++++++------------
 python/stijgers2json.py | 16 ++++++++++++++--
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/collect.sh b/collect.sh
index 1faeef9..8e9d124 100755
--- a/collect.sh
+++ b/collect.sh
@@ -143,6 +143,14 @@ do
             | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
             | items2count > $part-overige-namen-$ds-$i
 
+        say $part-allewoorden-$ds-$i  # below: de-duplicated alto output lines are fed to items2count
+        alto \
+            'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
+            'tt:%l\ti%d\t%I' $files \
+            | sed -e 's/pubdate: "[-0-9]*"//' \
+                  -e 's/\.[0-9][0-9]*$//' \
+            | sort | uniq | items2count > $part-allewoorden-$ds-$i
+
         # tellingen met tags en postags
 
         say $part-nieuwe-woorden-extra-$ds-$i
@@ -166,18 +174,6 @@ do
             | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
             > $part-nieuwe-adjww-extra-$ds-$i
 
-        # kale tellingen
-
-        say $part-allewoorden-$ds-$i
-        alto \
-            'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
-            'tt:%l\t%I' $files \
-            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
-            | sed -e 's/\t.*//' | uniq -c \
-            | grep -v '^ *1 ' \
-            | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
-            > $part-allewoorden-$ds-$i
-
     done
 
     # score
diff --git a/python/stijgers2json.py b/python/stijgers2json.py
index 44bfbd1..c41e71e 100755
--- a/python/stijgers2json.py
+++ b/python/stijgers2json.py
@@ -56,17 +56,24 @@ oudfile = sys.argv[1]
 nieuwfile = sys.argv[2]
 jsonfile = sys.argv[3]
 
+extras = {}  # word -> tab-joined fields after the count and word columns, from either input file
 counts_recent = {}
 counts_reference = {}
 
 with open(oudfile, "rt", encoding="utf-8") as fp:
     for line in fp:
         aa = line.split("\t")
-        counts_reference[aa[1].strip()] = int(aa[0])
+        w = aa[1].strip()  # the word is the second tab-separated field
+        counts_reference[w] = int(aa[0])
+        if aa[2:]:  # any fields beyond count and word are kept as extras
+            extras[w] = '\t'.join(aa[2:]).strip()
 with open(nieuwfile, "rt", encoding="utf-8") as fp:
     for line in fp:
         aa = line.split("\t")
-        counts_recent[aa[1].strip()] = int(aa[0])
+        w = aa[1].strip()  # the word is the second tab-separated field
+        counts_recent[w] = int(aa[0])
+        if aa[2:]:  # extras read here overwrite those stored from the reference file
+            extras[w] = '\t'.join(aa[2:]).strip()
 
 for key in counts_recent:
     if not key in counts_reference:
@@ -90,6 +97,11 @@ _, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
 for r, p_adj in zip(results, p_adjusted):
     r["p_g2_adjusted"] = p_adj
 
+for r in results:  # add the recent count (and extras, if any were read) to each result
+    r["n"] = int(counts_recent[r["word"]])
+    if extras:  # the key is omitted entirely when no input line had extra columns
+        r["extras"] = extras.get(r["word"], '')
+
 o = {}
 #o['up'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
 #o['dn'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]