72 lines
1.2 KiB
Bash
Executable File
72 lines
1.2 KiB
Bash
Executable File
#!/bin/bash
#
# Environment setup for the Parool corpus job.

# Stop at the first simple command that exits non-zero.
set -e

# Root of the news corpus tree, and the sub-tree this job works in.
BASE=/net/corpora/nlnieuws
PART="${BASE}/Parool"

# With CDPATH unset, relative `cd` targets resolve against the current
# directory only.
unset CDPATH

# Search the part directory, the shared bin/ and root directories and
# /net/aps/bin before anything already on the inherited PATH.
PATH="${PART}:${BASE}/bin:${BASE}:/net/aps/bin:${PATH}"

# All date computations below use Dutch local time.
TZ=Europe/Amsterdam
export TZ

# Source the Alpino activation script; anything it prints on stdout is dropped.
source /net/aps/etc/alpino-activate.sh > /dev/null
#######################################
# Print the week identifier ("YYYY.WW", ISO year and ISO week) to process.
# Arguments: $1 - optional week identifier. When empty or absent, the ISO
#                 year/week of the date seven days ago is used.
# Outputs:   the identifier on stdout; "INVALID" on stderr for a bad argument.
# Returns:   0 on success, 1 when $1 is not a well-formed week identifier.
#######################################
resolve_week() {
  case "${1:-}" in
    '')
      date -d -7days +%G.%V
      ;;
    # Year 2000-2999, week 01-53. Week 00 and weeks 54-59 do not exist in the
    # ISO calendar, so they are rejected here rather than failing later on.
    2[0-9][0-9][0-9].0[1-9] | 2[0-9][0-9][0-9].[1-4][0-9] | 2[0-9][0-9][0-9].5[0-3])
      printf '%s\n' "$1"
      ;;
    *)
      echo INVALID >&2
      return 1
      ;;
  esac
}

# Week identifier for this run; abort on an invalid argument or a date failure.
ds=$(resolve_week "${1:-}") || exit 1
# Split the week identifier at its dot: the text before the last dot is the
# year, the text after the first dot is the week number.
year="${ds%.*}"
week="${ds#*.}"

# Week directory, relative to $PART, that the job changes into.
dp="${year}/w${week}"

# Path prefix shared by the result files written for this week.
corpus="${PART}/corpus/${year}/${ds}"

# Make sure the per-year result directory exists, then enter the week directory.
mkdir -p "${PART}/corpus/${year}"
cd "${PART}/${dp}"
# Take the per-directory lock: a symlink named "lock" that points at
# "lock.<pid of this shell>". ln refuses to create it when the name already
# exists, so a second run in the same directory fails here. The readlink check
# additionally rejects the case where ln returned success without ./lock
# ending up as our own symlink. The test is an explicit `if` so that the
# message is printed even though the script runs under `set -e`.
lock_dir=$PWD
lock_target="lock.$$"
if ! ln -s "$lock_target" lock || [ "$(readlink lock)" != "$lock_target" ]; then
  echo Getting lock failed >&2
  exit 1
fi

# The lock is ours from here on. Release it on every exit path, including an
# abort caused by a failing command, so a crashed run cannot block later runs.
# The absolute path is needed because the working directory changes below.
trap 'rm -f -- "$lock_dir/lock"' EXIT

# Start from an empty scratch directory.
rm -fr out
mkdir out

# Rebuild the input for the parser. For every *.txt file: strip leading
# whitespace from each line (which also empties blank lines), drop lines that
# start with "##META", run the text through tokenize.sh, and prefix every
# resulting line with "parool.<file name without .txt>.<line number>|".
rm -f -- "${corpus}.lines"
for article in *.txt; do
  doc_id=${article%.txt}
  # The identifier is handed to Perl through the environment instead of being
  # pasted into the Perl source, so unusual file names cannot alter the code.
  perl -p -e 's/^\s*//; s/^##META.*\n//' -- "$article" \
    | tokenize.sh \
    | DOC_ID=$doc_id perl -e '$n = 0; while(<>) { $n++; print("parool.$ENV{DOC_ID}.$n|$_"); }' \
    >> "${corpus}.lines"
done

# Parse the collected lines from inside the scratch directory; the parser's
# stderr is kept as the log for this week.
# NOTE(review): presumably Alpino writes its XML output into ./xml - confirm.
cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse \
  < "${corpus}.lines" 2> "${corpus}.log"

metadata

# Replace any results of an earlier run with a fresh archive built from the
# XML files in out/xml.
cd xml
rm -f -- "${corpus}.data.dz" "${corpus}.index"
alto -q -o "${corpus}.data.dz" *.xml

# Tally per news item, not per sentence.
query.sh -x T -s "${corpus}.data.dz" > "${corpus}.tag.txt"

# Back in the week directory: discard the scratch tree. The lock itself is
# removed by the EXIT trap installed above.
cd ../..
rm -fr out