diff --git a/collect.sh b/collect.sh index 0e8dbff..56f17f3 100755 --- a/collect.sh +++ b/collect.sh @@ -6,6 +6,20 @@ unset CDPATH PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH export TZ=Europe/Amsterdam +verbose=0 +if [ "$1" = "-v" ] +then + shift + verbose=1 +fi + +say () { + if [ "$verbose" = "1" ] + then + echo "$*" + fi +} + if [ "$1" = "" ] then ds=`ISOWeek -7` @@ -57,6 +71,7 @@ do # tellingen te voorkomen. # Dit speelt alleen(?) bij atom-feeds, zoals van de VRT. + say $part-nieuwe-namen-$ds-$i alto \ 'fp://node[((@cat="mwu" and node[@pt="spec"]) or (@pt and @*="eigen" and not(@rel="mwp"))) and not(@his="normal") and not(@his_1="decap" or @his_1="0")]' \ 'tt:%w\t%d\t%I' $files \ @@ -65,6 +80,7 @@ do | items2count > $part-nieuwe-namen-$ds-$i top20 $part-nieuwe-namen-$ds-$i + say $part-nieuwe-woorden-$ds-$i alto \ 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ 'tt:%w\t%d\t%I' $files \ @@ -73,6 +89,7 @@ do | items2count > $part-nieuwe-woorden-$ds-$i top20 $part-nieuwe-woorden-$ds-$i + say $part-locaties-$ds-$i alto \ 'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="LOC"])]' \ 'tt:%l\t%d\t%I' $files \ @@ -80,6 +97,7 @@ do | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | items2count > $part-locaties-$ds-$i + say $part-personen-$ds-$i alto \ 'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="PER"])]' \ 'tt:%l\t%d\t%I' $files \ @@ -87,6 +105,7 @@ do | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | items2count > $part-personen-$ds-$i + say $part-organisaties-$ds-$i alto \ 'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="ORG"])]' \ 'tt:%l\t%d\t%I' $files \ @@ -94,6 +113,7 @@ do | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ | items2count > $part-organisaties-$ds-$i + say $part-overige-namen-$ds-$i alto \ 'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and node[@pt="spec" and @neclass="MISC"])]' \ 'tt:%l\t%d\t%I' $files \ @@ -103,6 +123,7 @@ do # tellingen met postags + say $part-nieuwe-woorden-extra-$ds-$i alto \ 'fp://node[@his and not(@rel="mwp" or @cat="mwu") and not(@his="normal" or @his="name" or @his="prefix_name" or @his_1="decap" or @ his_1="0" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="within_word_conjunct")]' \ @@ -113,6 +134,7 @@ do | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ > $part-nieuwe-woorden-extra-$ds-$i + say $part-nieuwe-adjww-extra-$ds-$i alto \ 'fp://node[@pt and @his and not(../@his="normal" or @rel="mwp" or ../@his="name" or ../@his_1="decap") and not(@his="normal" or @his="name" or @his="skip" or @his="robust_skip" or @his="w_dia" or @his="wo_dia" or @his="decap" or @his="within_word_conjunct") and not(@pt="n") ]' \ 'tt:%w\t%P\t%I' $files \ diff --git a/www/mkAll.py b/www/mkAll.py index 297ce9a..61bd191 100755 --- a/www/mkAll.py +++ b/www/mkAll.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import sys -import os import re import subprocess @@ -37,6 +36,11 @@ namen = { 'Tzum': 'Literatuur' } +verbose=False +if sys.argv[1] == '-v': + verbose=True + sys.argv = sys.argv[:1] + sys.argv[2:] + ep=sys.argv[1] if not re.match('^2[0-9][0-9][0-9]-[0-5][0-9]$', ep): print("Ongeldig patroon '", ep, "', moet yyyy-ww zijn") @@ -47,23 +51,20 @@ week=ep[5:].lstrip('0') for base in ('algemeen', 'VRT', 'groningen', 'AT5', 'Tzum'): name = namen[base] - fp = open(name + '.html', 'wt', encoding='utf-8') - fp.write(head.format(name, jaar, week, name, jaar, week)) - fp.flush() - for part in ('nieuwe-namen', 'nieuwe-woorden', 'personen', 'overige-namen', 'locaties', 'organisaties'): - if part == 'locaties': - fp.write('\n