diff --git a/collect.sh b/collect.sh index 48932b7..6f5f3a1 100755 --- a/collect.sh +++ b/collect.sh @@ -3,8 +3,9 @@ set -e unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=/net/corpora/nlnieuws/bin:/net/corpora/nlnieuws/python:/net/aps/bin:$PATH export TZ=Europe/Amsterdam +. /net/corpora/nlnieuws/python/env/bin/activate verbose=0 if [ "$1" = "-v" ] @@ -35,6 +36,8 @@ else esac fi +ds1=$(weekadd $ds -1) + year=${ds%%.*} mkdir -p /net/corpora/nlnieuws/data/$year @@ -165,7 +168,7 @@ do # kale tellingen - say $part-count-word-$ds-$i + say $part-allewoorden-$ds-$i alto \ 'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \ 'tt:%l\t%I' $files \ @@ -173,49 +176,27 @@ do | sed -e 's/\t.*//' | uniq -c \ | grep -v '^ *1 ' \ | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ - > $part-count-word-$ds-$i - - say $part-count-loc-$ds-$i - alto \ - 'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="LOC" ])]' \ - 'tt:%l\t%I' $files \ - | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ - | sed -e 's/\t.*//' | uniq -c \ - | grep -v '^ *1 ' \ - | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ - > $part-count-loc-$ds-$i - - say $part-count-per-$ds-$i - alto \ - 'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="PER" ])]' \ - 'tt:%l\t%I' $files \ - | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ - | sed -e 's/\t.*//' | uniq -c \ - | grep -v '^ *1 ' \ - | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ - > $part-count-per-$ds-$i - - say $part-count-org-$ds-$i - alto \ - 'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="ORG" ])]' \ - 'tt:%l\t%I' $files \ - | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ - | sed -e 's/\t.*//' | uniq -c \ - | grep -v '^ *1 ' \ - | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ - > $part-count-org-$ds-$i - - say $part-count-misc-$ds-$i - alto \ - 'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="MISC" ])]' \ - 'tt:%l\t%I' $files \ - | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \ - | sed -e 's/\t.*//' | uniq -c \ - | grep -v '^ *1 ' \ - | sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \ - > $part-count-misc-$ds-$i + > $part-allewoorden-$ds-$i done + + # score + say "$part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 > $part-allewoorden-$ds.score14" + stijgers.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 > $part-allewoorden-$ds.score14 + + say "$part-locaties-$ds1-4 $part-locaties-$ds-1 > $part-locaties-$ds.score14" + stijgers.py $part-locaties-$ds1-4 $part-locaties-$ds-1 > $part-locaties-$ds.score14 + + say "$part-personen-$ds1-4 $part-personen-$ds-1 > $part-personen-$ds.score14" + stijgers.py $part-personen-$ds1-4 $part-personen-$ds-1 > $part-personen-$ds.score14 + + say "$part-organisaties-$ds1-4 $part-organisaties-$ds-1 > $part-organisaties-$ds.score14" + stijgers.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 > $part-organisaties-$ds.score14 + + say "$part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 > $part-overige-namen-$ds.score14" + stijgers.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 > $part-overige-namen-$ds.score14 + + done data2json $ds 1 > ../json/$year/DATA-$ds-1.json diff --git a/python/stijgers.py b/python/stijgers.py index a9b6012..efe5b18 100755 --- a/python/stijgers.py +++ b/python/stijgers.py @@ -1,4 +1,4 @@ -#!/net/corpora/nlnieuws/notebook/bin/python3 +#!/usr/bin/env python3 import sys import numpy as np