#!/bin/bash
#
# Process one day of NieuwsNL text files: tokenize every *.txt file of the
# day into a single labelled ".lines" file, run Alpino on it, run the
# site-local "metadata" step, pack the resulting XML with alto and write a
# tag file with namen.sh.
#
# Usage: <this script> [YYYY-MM-DD]
#   Without an argument the date of two days ago is used.
#
# Outputs (all under /net/corpora/nlnieuws/NieuwsNL/corpus/):
#   <date>.lines  <date>.log  <date>.data.dz  <date>.tag.txt
#
# A symlink named "lock" in the day directory prevents concurrent runs for
# the same day; it is removed again on every exit path once acquired.

set -e
# NOTE(review): `set -u` / `set -o pipefail` are not enabled here because the
# sourced activation script and the external tools have not been checked
# against them.

unset CDPATH

PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam

. /net/aps/etc/alpino-activate.sh > /dev/null

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [[ -z "$1" ]]; then
  # nieuws.nl is handled per day, not per week, so take the data of
  # 2 days ago rather than of a week ago.
  ds=$(date -d -2days +%Y-%m-%d)
else
  case "$1" in
    2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
      # The pattern only checks the shape; let date(1) reject impossible
      # dates such as 2024-19-39 or 2024-02-31.
      if [[ "$(date -d "$1" +%Y-%m-%d 2> /dev/null)" != "$1" ]]; then
        die INVALID
      fi
      ds=$1
      ;;
    *)
      die INVALID
      ;;
  esac
fi

# YYYY-MM-DD -> YYYY/MM/DD (the day directory layout).
dp=${ds//-//}

root=/net/corpora/nlnieuws/NieuwsNL
corpus=$root/corpus/$ds
daydir=$root/$dp
# Absolute path, so the cleanup trap works from whatever directory the
# script happens to be in when it exits.
lock=$daydir/lock

cd "$daydir"

# Creating a symlink fails if the name already exists, which makes it usable
# as a lock. The ln call is part of the condition so that a failure
# produces the message below instead of a bare `set -e` abort.
if ! ln -s "lock.$$" "$lock" || [[ "$(readlink "$lock")" != "lock.$$" ]]; then
  die Getting lock failed
fi
# Installed only after the lock is ours: never remove someone else's lock.
trap 'rm -f -- "$lock"' EXIT

rm -fr out
mkdir out

# Start from an empty .lines file; it must exist even when the day has no
# *.txt files, because it is used as stdin further down.
rm -f -- "$corpus.lines"
: > "$corpus.lines"

for txt in *.txt; do
  # An unmatched glob stays as the literal string "*.txt"; skip it.
  [[ -e "$txt" ]] || continue
  doc=${txt%.txt}
  # First perl: strip leading whitespace (which also drops blank lines) and
  # remove lines starting with "##META".
  # Second perl: prefix every tokenized line with "nnl.<file>.<n>|", n
  # counting from 1 per file. The file name is passed through the
  # environment rather than spliced into the Perl source, so names with
  # quotes, "@" or "$" cannot change the program.
  perl -p -e 's/^\s*//; s/^##META.*\n//' < "$txt" \
    | tokenize.sh \
    | DOC="$doc" perl -e '$n = 0; while(<>) { $n++; print("nnl.$ENV{DOC}.$n|$_"); }' \
      >> "$corpus.lines"
done

cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse \
  < "$corpus.lines" 2> "$corpus.log"
# Resolved relative to <daydir>/out, i.e. the "metadata" program four
# directory levels up.
../../../../metadata
cd xml
# stderr of alto is discarded, as in the original script.
alto -o "$corpus.data.dz" *.xml 2> /dev/null
# Count per message, not per sentence.
/net/corpora/nlnieuws/namen.sh -x T -s "$corpus.data.dz" > "$corpus.tag.txt"
cd ../..
rm -fr out

# The EXIT trap would remove the lock as well; done explicitly to keep the
# normal path obvious.
rm -f -- "$lock"