Files
nlnieuws/Parool/txt2corpus.sh
Peter Kleiweg 5c651387af grote reorganisatie:
- HLN, NOS, NU, VRT: per week -> per dag
- yyyy-ww -> yyyy.ww
- yyyy*  -> yyyy/yyyy*
etc
2026-05-27 22:42:03 +02:00

72 lines
1.2 KiB
Bash
Executable File

#!/bin/bash
#
# Build the Parool corpus for one ISO week: tokenize that week's *.txt
# files, parse the sentences with Alpino, and pack the resulting XML into
# a compact corpus plus a tag count file.
#
# Usage: txt2corpus.sh [yyyy.ww]     (default: the ISO week of 7 days ago)

# Stop at the first command that fails.
set -o errexit

BASE=/net/corpora/nlnieuws
PART="${BASE}/Parool"

# The script relies on relative `cd` calls; CDPATH must not redirect them.
unset CDPATH

# Helper scripts are looked up in the Parool directory first, then in the
# shared nlnieuws locations, then in /net/aps/bin.
PATH="${PART}:${BASE}/bin:${BASE}:/net/aps/bin:${PATH}"

# Week numbers are computed in Dutch local time.
TZ=Europe/Amsterdam
export TZ

# Load the Alpino environment; whatever the activation script prints on
# stdout is discarded.
source /net/aps/etc/alpino-activate.sh > /dev/null
# Determine the week label `ds` (format yyyy.ww).
#   no argument  -> ISO year and ISO week of the day one week ago
#   yyyy.ww      -> used verbatim
#   anything else is rejected with exit status 1
case "$1" in
  '')
    ds=$(date -d -7days +%G.%V)
    ;;
  2[0-9][0-9][0-9].[0-5][0-9])
    ds=$1
    ;;
  *)
    echo INVALID
    exit 1
    ;;
esac
# Split the week label (yyyy.ww) and derive the working paths from it.
year=${ds%.*}                     # part before the dot
week=${ds#*.}                     # part after the dot
dp=$year/w$week                   # input directory, relative to $PART
corpus=$PART/corpus/$year/$ds     # common prefix of every output file
mkdir -p "$PART/corpus/$year"
cd "$PART/$dp"

# Take a lock for this week's directory. Creating a symlink fails when the
# name already exists, so only one process can obtain `lock`; the readlink
# comparison double-checks that the link really points at our own PID.
# `ln` is tested inside the `if` so that, with errexit active, a failure
# reaches the message below instead of aborting the script silently.
if ! ln -s "lock.$$" lock || [ "$(readlink lock)" != "lock.$$" ]
then
  echo Getting lock failed
  exit 1
fi

# Remove the lock when the script exits, but only if it is still ours, so
# a failing step no longer leaves a stale lock that blocks the next run.
# The absolute path is needed because later steps change directory.
release_lock() {
  if [ "$(readlink "$PART/$dp/lock")" = "lock.$$" ]
  then
    rm -f "$PART/$dp/lock"
  fi
}
trap release_lock EXIT
# Start from an empty scratch directory and a fresh sentence file.
rm -fr out
mkdir out
rm -f "$corpus.lines"

# Convert every article (*.txt) in the current directory into tokenized
# lines, each prefixed with the key "parool.<file stem>.<line number>|",
# and collect all of them in $corpus.lines.
for article in *.txt
do
  # When nothing matches, the pattern stays literal; skip it rather than
  # handing perl a file that does not exist.
  [ -e "$article" ] || continue
  stem=${article%.txt}
  # First perl: strip leading whitespace, then drop lines starting with
  # ##META. Second perl: number the tokenizer's output lines ($. is the
  # current input line number). The stem is passed through the
  # environment so that it is never interpreted as Perl code.
  perl -p -e 's/^\s*//; s/^##META.*\n//' "$article" | tokenize.sh \
    | stem="$stem" perl -n -e 'print "parool.$ENV{stem}.$.|$_"'
done > "$corpus.lines"
# Parse all sentences from inside the scratch directory. Alpino reads the
# keyed sentences on stdin and its stderr is kept as the log file.
# NOTE(review): the flags presumably make Alpino write one XML file per
# sentence into ./xml — confirm against the Alpino documentation.
cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse \
  < "${corpus}.lines" 2> "${corpus}.log"

# `metadata` is a helper found via PATH; it runs in out/, next to xml/.
metadata

# Pack the parsed XML files into a single corpus file, removing the
# previous corpus and its index first.
cd xml
rm -f "${corpus}.data.dz" "${corpus}.index"
alto -q -o "${corpus}.data.dz" *.xml

# counts are per message (article), not per sentence
query.sh -x T -s "${corpus}.data.dz" > "${corpus}.tag.txt"

# Back to the week directory: discard the scratch data, release the lock.
cd ../..
rm -fr out
rm -f lock