64 lines
1.0 KiB
Bash
Executable File
64 lines
1.0 KiB
Bash
Executable File
#!/bin/bash
#
# Tokenize, parse and index the *.txt files of one week directory under
# /net/corpora/nlnieuws/GG.
#
# Usage: this-script [YYYY-WW]
#   YYYY-WW  Week to process (e.g. 2016-07). When omitted, the value
#            printed by `ISOWeek -7` is used (presumably the ISO week of
#            seven days ago -- confirm against the ISOWeek tool).
#
# Input:   /net/corpora/nlnieuws/GG/YYYY/WW/*.txt
# Output:  /net/corpora/nlnieuws/GG/corpus/YYYY-WW.lines    labelled input lines
#          /net/corpora/nlnieuws/GG/corpus/YYYY-WW.log      stderr of Alpino
#          /net/corpora/nlnieuws/GG/corpus/YYYY-WW.data.dz  written by alto
#          (YYYY-WW.index is removed beforehand; presumably alto recreates it.)
#
# A symlink named "lock" in the week directory guards against concurrent
# runs. It is removed on every exit path once it has been acquired.

set -e

# Keep the relative `cd` calls below from being resolved through CDPATH.
unset CDPATH

PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH

export TZ=Europe/Amsterdam

. /net/aps/etc/alpino-activate.sh > /dev/null

# Determine the week to process.
if [ -z "$1" ]; then
  ds=$(ISOWeek -7)
else
  case "$1" in
    2[0-9][0-9][0-9]-[0-5][0-9])
      ds=$1
      ;;
    *)
      echo INVALID >&2
      exit 1
      ;;
  esac
fi

# YYYY-WW -> YYYY/WW
dp=${ds//-//}

corpus=/net/corpora/nlnieuws/GG/corpus/$ds

cd "/net/corpora/nlnieuws/GG/$dp"
workdir=$PWD

# Acquire the lock. `ln -s` fails when "lock" already exists; it is run
# inside the condition so that `set -e` does not abort before the message
# below is printed. The readlink comparison double-checks ownership.
lock_target=lock.$$
if ! ln -s "$lock_target" lock || [ "$(readlink lock)" != "$lock_target" ]; then
  echo 'Getting lock failed' >&2
  exit 1
fi

# From here on we own the lock: release it however the script ends, so a
# failing step does not leave a stale lock that blocks later runs.
# The absolute path is used because the working directory changes below.
release_lock() {
  rm -f -- "$workdir/lock"
}
trap release_lock EXIT

# Start from a clean scratch directory (a previous failed run may have
# left one behind).
rm -fr out
mkdir out

# Recreate the .lines file so it exists even when there are no *.txt files.
rm -f -- "${corpus}.lines"
: > "${corpus}.lines"

for i in *.txt; do
  # An unmatched glob is left as the literal string "*.txt"; skip it.
  [ -e "$i" ] || continue
  b=${i%.txt}
  # Strip leading whitespace, drop lines starting with ##META, tokenize,
  # and prefix every resulting line with "gg.<basename>.<lineno>|".
  # The basename is handed to perl as an argument instead of being spliced
  # into the program text, so unusual file names cannot alter the code.
  perl -p -e 's/^\s*//; s/^##META.*\n//' < "$i" | tokenize.sh \
    | perl -e '$id = shift; $n = 0; while (<STDIN>) { $n++; print("gg.$id.$n|$_"); }' "$b" \
    >> "${corpus}.lines"
done

cd out
mkdir xml
# Parse all lines; presumably Alpino writes one XML file per line into
# ./xml (flag "treebank xml") -- confirm against the Alpino documentation.
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse \
  < "${corpus}.lines" 2> "${corpus}.log"

# Site-local helper located at /net/corpora/nlnieuws/GG/metadata, run from
# inside "out"; its effect is not visible from this script.
../../../metadata

cd xml
rm -f -- "${corpus}.data.dz" "${corpus}.index"
alto -q -o "${corpus}.data.dz" *.xml

# Back to the week directory; discard the scratch directory.
cd ../..
rm -fr out

# The lock is removed by the EXIT trap installed above.