From ce8ed07327a3a28ae1e875444f23d1d48c4d08dd Mon Sep 17 00:00:00 2001 From: Peter Kleiweg Date: Sat, 20 Jun 2026 12:41:48 +0200 Subject: [PATCH] update stijgers2json.py --- collect.sh | 20 ++++++++------------ python/stijgers2json.py | 16 ++++++++++++++-- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/collect.sh b/collect.sh index 1faeef9..8e9d124 100755 --- a/collect.sh +++ b/collect.sh @@ -143,6 +143,14 @@ do | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | items2count > $part-overige-namen-$ds-$i + say $part-allewoorden-$ds-$i + alto \ + 'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \ + 'tt:%l\ti%d\t%I' $files \ + | sed -e 's/pubdate: "[-0-9]*"//' \ + | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ + | items2count > $part-allewoorden-$ds-$i + # tellingen met tags en postags say $part-nieuwe-woorden-extra-$ds-$i @@ -166,18 +174,6 @@ do | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ > $part-nieuwe-adjww-extra-$ds-$i - # kale tellingen - - say $part-allewoorden-$ds-$i - alto \ - 'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \ - 'tt:%l\t%I' $files \ - | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ - | sed -e 's/\t.*//' | uniq -c \ - | grep -v '^ *1 ' \ - | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ - > $part-allewoorden-$ds-$i - done # score diff --git a/python/stijgers2json.py b/python/stijgers2json.py index 44bfbd1..c41e71e 100755 --- a/python/stijgers2json.py +++ b/python/stijgers2json.py @@ -56,17 +56,24 @@ oudfile = sys.argv[1] nieuwfile = sys.argv[2] jsonfile = sys.argv[3] +extras = {} counts_recent = {} counts_reference = {} with open(oudfile, "rt", encoding="utf-8") as fp: for line in fp: aa = line.split("\t") - counts_reference[aa[1].strip()] = int(aa[0]) + w = aa[1].strip() + counts_reference[w] = int(aa[0]) + if len(aa) > 2: + extras[w] = '\t'.join(aa[2:]).strip() with open(nieuwfile, "rt", encoding="utf-8") as fp: for line in fp: aa = line.split("\t") - counts_recent[aa[1].strip()] = int(aa[0]) + w = aa[1].strip() + counts_recent[w] = int(aa[0]) + if len(aa) > 2: + extras[w] = '\t'.join(aa[2:]).strip() for key in counts_recent: if not key in counts_reference: @@ -90,6 +97,11 @@ _, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh") for r, p_adj in zip(results, p_adjusted): r["p_g2_adjusted"] = p_adj +for i in range(len(results)): + results[i]["n"] = int(counts_recent[results[i]["word"]]) + if len(extras) > 0: + results[i]["extras"] = extras.get(results[i]["word"], '') + o = {} #o['up'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40] #o['dn'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]