#!/bin/bash
#
# Tokenizes and parses one batch of AT5 text with Alpino and packs the
# resulting XML files into /net/corpora/nlnieuws/AT5/corpus/<ds>.data.dz.
#
# Usage: <this script> [YYYY-WW]
#   YYYY-WW  batch id; must match 2[0-9][0-9][0-9]-[0-5][0-9].
#            When omitted, the value printed by `ISOWeek -7` is used
#            (presumably the ISO week of seven days ago -- confirm against
#            the ISOWeek helper).
#
# Files written (<ds> = batch id):
#   corpus/<ds>.lines    tokenizer output, each line prefixed "at5.<file>.<n>|"
#   corpus/<ds>.log      stderr of the Alpino run
#   corpus/<ds>.data.dz  output of `alto` over the XML files in out/xml
#
# Exit status: 1 for an invalid argument or when the lock cannot be taken;
# otherwise `set -e` aborts with the status of the first failing command.

set -e
unset CDPATH

PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam

# Set up the Alpino environment; whatever the script prints on stdout is
# discarded.
. /net/aps/etc/alpino-activate.sh > /dev/null

# Determine the batch id: explicit argument (validated) or the default.
if [[ -z "$1" ]]; then
  ds=$(ISOWeek -7)
else
  case "$1" in
    2[0-9][0-9][0-9]-[0-5][0-9])
      ds=$1
      ;;
    *)
      # Message and stream (stdout) kept exactly as before.
      echo INVALID
      exit 1
      ;;
  esac
fi

dp=${ds//-//} # every "-" becomes "/", e.g. 2024-07 -> 2024/07
corpus=/net/corpora/nlnieuws/AT5/corpus/$ds
workdir=/net/corpora/nlnieuws/AT5/$dp
# Absolute path: the script changes directory later on, and the lock must
# still be removable from wherever a failure happens.
lock=$workdir/lock

cd "$workdir"

# Removes the lock, but only when it is the one created by this process.
release_lock() {
  if [[ "$(readlink "$lock")" == "lock.$$" ]]; then
    rm -f -- "$lock"
  fi
}

# Locking: create a symlink named "lock" whose target names this process.
# ln fails when "lock" already exists; that failure is tolerated here so
# that the readlink check below decides and the message is actually printed
# (with a bare `ln` under `set -e` the script died before reaching it).
ln -s "lock.$$" "$lock" || true
if [[ "$(readlink "$lock")" != "lock.$$" ]]; then
  echo Getting lock failed
  exit 1
fi
# From here on the lock is ours: release it on every exit path, including
# `set -e` aborts, so a failed run does not block the next one.
trap release_lock EXIT

# Fresh scratch directory. NOTE(review): xml2txt presumably fills out/ with
# one .txt file per document -- confirm against that tool.
rm -fr out
mkdir out
../../xml2txt "$ds"

# Build <corpus>.lines: for every text file, strip leading whitespace and
# "##META" lines, tokenize, and prefix each resulting line with
# "at5.<file basename>.<line number>|".
rm -f -- "$corpus.lines"
for txt in out/*.txt; do
  # An unmatched glob leaves the literal pattern behind; skip it.
  [[ -e "$txt" ]] || continue
  name=${txt##*/}
  name=${name%.txt}
  # The basename is handed to perl as an argument instead of being spliced
  # into the program text, so unusual file names cannot alter the code.
  perl -p -e 's/^\s*//; s/^##META.*\n//' "$txt" \
    | tokenize.sh \
    | perl -e '$id = shift; $n = 0; while (<STDIN>) { $n++; print("at5.$id.$n|$_"); }' "$name"
done > "$corpus.lines"

# Parse the numbered lines with Alpino; its stderr becomes <corpus>.log.
# NOTE(review): with these flags Alpino presumably writes its XML output
# into ./xml -- confirm against the Alpino documentation.
cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse \
  < "$corpus.lines" 2> "$corpus.log"

# Runs with out/ as the working directory. NOTE(review): purpose of this
# helper is not visible from here.
../../../metadata

# Pack all XML files into <corpus>.data.dz. stderr is discarded, as in the
# original -- NOTE(review): confirm that hiding alto's errors is intended.
cd xml
alto -o "$corpus.data.dz" *.xml 2> /dev/null

# Back to the batch directory; drop the scratch data. The lock itself is
# removed by the EXIT trap.
cd ../..
rm -fr out