update stijgers2json.py
This commit is contained in:
20
collect.sh
20
collect.sh
@@ -143,6 +143,14 @@ do
|
|||||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
||||||
| items2count > $part-overige-namen-$ds-$i
|
| items2count > $part-overige-namen-$ds-$i
|
||||||
|
|
||||||
|
say $part-allewoorden-$ds-$i
|
||||||
|
alto \
|
||||||
|
'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
|
||||||
|
'tt:%l\ti%d\t%I' $files \
|
||||||
|
| sed -e 's/pubdate: "[-0-9]*"//' \
|
||||||
|
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
||||||
|
| items2count > $part-allewoorden-$ds-$i
|
||||||
|
|
||||||
# tellingen met tags en postags
|
# tellingen met tags en postags
|
||||||
|
|
||||||
say $part-nieuwe-woorden-extra-$ds-$i
|
say $part-nieuwe-woorden-extra-$ds-$i
|
||||||
@@ -166,18 +174,6 @@ do
|
|||||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
||||||
> $part-nieuwe-adjww-extra-$ds-$i
|
> $part-nieuwe-adjww-extra-$ds-$i
|
||||||
|
|
||||||
# kale tellingen
|
|
||||||
|
|
||||||
say $part-allewoorden-$ds-$i
|
|
||||||
alto \
|
|
||||||
'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
|
|
||||||
'tt:%l\t%I' $files \
|
|
||||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
|
||||||
| sed -e 's/\t.*//' | uniq -c \
|
|
||||||
| grep -v '^ *1 ' \
|
|
||||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
|
||||||
> $part-allewoorden-$ds-$i
|
|
||||||
|
|
||||||
done
|
done
|
||||||
|
|
||||||
# score
|
# score
|
||||||
|
|||||||
@@ -56,17 +56,24 @@ oudfile = sys.argv[1]
|
|||||||
nieuwfile = sys.argv[2]
|
nieuwfile = sys.argv[2]
|
||||||
jsonfile = sys.argv[3]
|
jsonfile = sys.argv[3]
|
||||||
|
|
||||||
|
extras = {}
|
||||||
counts_recent = {}
|
counts_recent = {}
|
||||||
counts_reference = {}
|
counts_reference = {}
|
||||||
|
|
||||||
with open(oudfile, "rt", encoding="utf-8") as fp:
|
with open(oudfile, "rt", encoding="utf-8") as fp:
|
||||||
for line in fp:
|
for line in fp:
|
||||||
aa = line.split("\t")
|
aa = line.split("\t")
|
||||||
counts_reference[aa[1].strip()] = int(aa[0])
|
w = aa[1].strip()
|
||||||
|
counts_reference[w] = int(aa[0])
|
||||||
|
if len(aa) > 2:
|
||||||
|
extras[w] = '\t'.join(aa[2:]).strip()
|
||||||
with open(nieuwfile, "rt", encoding="utf-8") as fp:
|
with open(nieuwfile, "rt", encoding="utf-8") as fp:
|
||||||
for line in fp:
|
for line in fp:
|
||||||
aa = line.split("\t")
|
aa = line.split("\t")
|
||||||
counts_recent[aa[1].strip()] = int(aa[0])
|
w = aa[1].strip()
|
||||||
|
counts_recent[w] = int(aa[0])
|
||||||
|
if len(aa) > 2:
|
||||||
|
extras[w] = '\t'.join(aa[2:]).strip()
|
||||||
|
|
||||||
for key in counts_recent:
|
for key in counts_recent:
|
||||||
if not key in counts_reference:
|
if not key in counts_reference:
|
||||||
@@ -90,6 +97,11 @@ _, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
|
|||||||
for r, p_adj in zip(results, p_adjusted):
|
for r, p_adj in zip(results, p_adjusted):
|
||||||
r["p_g2_adjusted"] = p_adj
|
r["p_g2_adjusted"] = p_adj
|
||||||
|
|
||||||
|
for i in range(len(results)):
|
||||||
|
results[i]["n"] = int(counts_recent[results[i]["word"]])
|
||||||
|
if len(extras) > 0:
|
||||||
|
results[i]["extras"] = extras.get(results[i]["word"], '')
|
||||||
|
|
||||||
o = {}
|
o = {}
|
||||||
#o['up'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
|
#o['up'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
|
||||||
#o['dn'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]
|
#o['dn'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]
|
||||||
|
|||||||
Reference in New Issue
Block a user