#!/bin/bash
#
# Parse one week of HLN news text with Alpino and write the resulting
# corpus files under /net/corpora/nlnieuws/HLN/corpus/.
#
# Usage: <this script> [YYYY-WW]
#   YYYY-WW  Year and week number to process. When omitted, the ISO
#            year/week (date +%G-%V) of seven days ago is used.
#
# Input:   *.txt files in /net/corpora/nlnieuws/HLN/YYYY/WW
# Output:  /net/corpora/nlnieuws/HLN/corpus/YYYY-WW.lines    (labelled input lines)
#          /net/corpora/nlnieuws/HLN/corpus/YYYY-WW.log      (Alpino stderr)
#          /net/corpora/nlnieuws/HLN/corpus/YYYY-WW.data.dz  (written by alto)
#          /net/corpora/nlnieuws/HLN/corpus/YYYY-WW.tag.txt  (output of namen.sh)
#
# A symlink named "lock" in the week directory guards against concurrent
# runs; a scratch directory "out" in the same place is removed at the end.
#
# Exit status: non-zero on an invalid argument, when the lock cannot be
# obtained, or when any step fails (set -e).

set -e

# A set CDPATH can make relative `cd` resolve elsewhere (and print the path).
unset CDPATH

PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam

# Set up the Alpino environment; whatever it prints is discarded.
# shellcheck disable=SC1091
. /net/aps/etc/alpino-activate.sh > /dev/null

# Determine the week to process.
if [ -z "${1:-}" ]; then
  # %G-%V: ISO week-numbering year and ISO week number (GNU date).
  ds=$(date -d -7days +%G-%V)
else
  case "$1" in
    2[0-9][0-9][0-9]-[0-5][0-9])
      ds=$1
      ;;
    *)
      echo INVALID >&2
      exit 1
      ;;
  esac
fi

# YYYY-WW -> YYYY/WW (every "-" becomes "/").
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/HLN/corpus/$ds
workdir=/net/corpora/nlnieuws/HLN/$dp
# Absolute path, so the cleanup trap works from whatever directory we are in.
lock=$workdir/lock

cd "$workdir"

# Take the lock. Creating a symlink fails if the name already exists. The
# ln call is tested explicitly because, under `set -e`, a bare failing ln
# would abort the script before the intended message could be printed.
if ! ln -s "lock.$$" "$lock" || [ "$(readlink "$lock")" != "lock.$$" ]; then
  echo Getting lock failed >&2
  exit 1
fi
# Release the lock on every exit from here on, including aborts caused by
# `set -e`; otherwise a failed run would block all later runs for this week.
trap 'rm -f -- "$lock"' EXIT

rm -fr out
mkdir out

# Build the labelled input file from scratch.
rm -f "${corpus}.lines"
: > "${corpus}.lines"
for i in *.txt; do
  # Without any match the glob stays literal ("*.txt"); skip that case.
  [ -e "$i" ] || continue
  b=${i%.txt}
  # 1. Strip leading whitespace from each line and drop lines that start
  #    with "##META".
  # 2. Run the result through tokenize.sh.
  # 3. Prefix every output line with "hln.<file>.<n>|", where <n> counts
  #    lines within this file starting at 1. The file name is handed to
  #    Perl through the environment rather than pasted into the program
  #    text, so unusual characters in the name cannot break the code.
  perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" \
    | tokenize.sh \
    | HLN_ID="$b" perl -e '$n = 0; while(<>) { $n++; print("hln.$ENV{HLN_ID}.$n|$_"); }' \
    >> "${corpus}.lines"
done

cd out
mkdir xml
# Parse all labelled lines; the treebank flag names the xml directory made
# just above, and Alpino's stderr is kept as the log for this week.
# NOTE(review): user_max=900000 is presumably a per-sentence limit - confirm.
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse \
  < "${corpus}.lines" 2> "${corpus}.log"

# NOTE(review): runs the "metadata" program found three levels up from out/
# (the HLN directory). Presumably it puts the ##META information that was
# stripped above into the files under xml/ - confirm.
../../../metadata

cd xml
rm -f "${corpus}.data.dz" "${corpus}.index"
alto -q -o "${corpus}.data.dz" *.xml

# Count per message (article), not per sentence.
/net/corpora/nlnieuws/namen.sh -x T -s "${corpus}.data.dz" > "${corpus}.tag.txt"

# Remove the scratch directory; the lock itself is removed by the EXIT trap.
cd "$workdir"
rm -fr out