#!/bin/bash
#
# Build the weekly "rang" files for the nlnieuws corpus.
#
# Usage: SCRIPT [-v] [YYYY.WW]
#   -v        print the name of each output file before it is written
#   YYYY.WW   ISO week-based year and ISO week number (date's %G.%V format);
#             defaults to the week that contained the day seven days ago
#
# For every enabled entry in `parts` and for each window argument (1 and 4)
# the script writes, into /net/corpora/nlnieuws/data/YEAR, the files
#   PART-rang-{word,loc,per,org,misc}-YYYY.WW-WINDOW
# A part/window combination with no matching input files is skipped.
#
# External commands expected on PATH: week2files, alto, rang.

# Only -e is enabled. Without pipefail the status of each pipeline remains the
# status of its last command, which is what the rest of the script relies on.
set -e
unset CDPATH

PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH

# Make `date` compute the default week in Dutch local time.
export TZ=Europe/Amsterdam

verbose=0
if [ "$1" = "-v" ]; then
  shift
  verbose=1
fi

# Print the arguments on one line, but only when -v was given.
say() {
  # Keep this an if-statement: a bare `[ ... ] && ...` would return non-zero
  # when not verbose and abort the script under `set -e`.
  if [ "$verbose" = "1" ]; then
    printf '%s\n' "$*"
  fi
}

# Run one alto query over a set of files and write the ranked result.
# Arguments: $1     - output file name (also announced through say)
#            $2     - alto query expression
#            $3...  - input files handed to alto
# The trailing ".<digits>" of every alto output line is removed and duplicate
# lines are dropped before the stream is piped into rang.
rank_nodes() {
  local out=$1 query=$2
  shift 2
  say "$out"
  alto "$query" 'tt:%l\t%I' "$@" \
    | sed -e 's/\.[0-9][0-9]*$//' \
    | sort \
    | uniq \
    | rang > "$out"
}

# Select the week to process: explicit argument, or last week by default.
# NOTE(review): the pattern also lets through week numbers 00 and 54-59.
case "$1" in
  '')
    ds=$(date -d -7days +%G.%V)
    ;;
  2[0-9][0-9][0-9].[0-5][0-9])
    ds=$1
    ;;
  *)
    echo INVALID
    exit 1
    ;;
esac

year=${ds%%.*}

mkdir -p "/net/corpora/nlnieuws/data/$year"
mkdir -p "/net/corpora/nlnieuws/data/json/$year"
cd "/net/corpora/nlnieuws/data/$year"

# Map from part name (used as output file prefix) to an extended regular
# expression that selects input file paths. Commented-out entries are
# disabled parts that can be switched back on.
declare -A parts
#parts[alles]='.'
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
#parts[amsterdam]='AT5|BuurtAdam|Parool'
#parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
#parts[literatuur]='LitNL|Tzum'
#parts[vlaanderen]='HLN|VRT'
#parts[AT5]='AT5'
#parts[BuurtAdam]='BuurtAdam'
#parts[BuurtGrn]='BuurtGrn'
#parts[GG]='GG'
#parts[HLN]='HLN'
#parts[LitNL]='LitNL'
#parts[NOS]='NOS'
#parts[NU]='NU'
#parts[NieuwsNL]='NieuwsNL'
#parts[Oog]='Oog'
#parts[Parool]='Parool'
#parts[RO]='RO'
#parts[RTVNoord]='RTVNoord'
#parts[Sargasso]='Sargasso'
#parts[Sikkom]='Sikkom'
#parts[Tzum]='Tzum'
#parts[Volkskrant]='Volkskrant'
#parts[VRT]='VRT'

for part in "${!parts[@]}"; do
  regex=${parts[$part]}

  # The second argument of week2files is presumably a window length in weeks
  # (1 and 4) -- TODO confirm against week2files.
  for i in 1 4; do
    # Collect matching paths one per line into an array, so that paths with
    # spaces or glob characters reach alto intact. The output of week2files is
    # deliberately left unquoted: it is word-split into arguments for find.
    # An empty result (grep matched nothing) simply skips this combination.
    # shellcheck disable=SC2046
    mapfile -t files < <(find ../.. $(week2files "$ds" "$i") | grep -E "$regex")
    if (( ${#files[@]} == 0 )); then
      continue
    fi

    suffix="$ds-$i"

    # Nodes with @pt (excluding pt="let", rel="mwp" and anything carrying
    # @neclass), plus multi-word units that contain no @neclass node.
    rank_nodes "$part-rang-word-$suffix" \
      'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
      "${files[@]}"

    # One ranking per named-entity class; the output file uses the class name
    # in lower case (loc, per, org, misc).
    for neclass in LOC PER ORG MISC; do
      rank_nodes "$part-rang-${neclass,,}-$suffix" \
        "fp://node[(@neclass=\"$neclass\" and not(@rel=\"mwp\")) or (@cat=\"mwu\" and .//node[@neclass=\"$neclass\" ])]" \
        "${files[@]}"
    done
  done
done