update
This commit is contained in:
70
Volkskrant/txt2corpus.sh
Executable file
70
Volkskrant/txt2corpus.sh
Executable file
@@ -0,0 +1,70 @@
|
||||
#!/bin/bash
#
# Usage: txt2corpus.sh [YYYY-MM-DD]
#
# Turns the Volkskrant *.txt files of one day into corpus files for that day.
# Without an argument the day two days before today is processed.

# Abort as soon as a command fails.
set -e

# Root of the news corpora and the Volkskrant part below it.
BASE=/net/corpora/nlnieuws
PART="${BASE}/Volkskrant"

# A set CDPATH can make a relative `cd` land in another directory.
unset CDPATH
# Search the project directories before the inherited PATH.
PATH="${PART}:${BASE}/bin:${BASE}:/net/aps/bin:${PATH}"
# The default day is computed with `date`, so pin the time zone.
export TZ=Europe/Amsterdam
# Presumably sets up the Alpino environment (verify in that file); its
# standard output is discarded.
source /net/aps/etc/alpino-activate.sh >/dev/null

# Select the day to process and store it in ds as YYYY-MM-DD.
#   no argument   -> two days before today
#   YYYY-MM-DD    -> that day, provided it is a real calendar date
#   anything else -> print INVALID and exit with status 1
case "${1:-}" in
  "")
    ds=$(date -d -2days +%Y-%m-%d)
    ;;
  2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
    # The glob only checks the shape, so 2024-13-45 or 2024-02-30 would get
    # through. Let date(1) confirm the day exists; -u avoids any dependence
    # on local-time gaps.
    if [ "$(date -u -d "$1" +%Y-%m-%d 2>/dev/null)" != "$1" ]; then
      echo INVALID
      exit 1
    fi
    ds=$1
    ;;
  *)
    echo INVALID
    exit 1
    ;;
esac

# Day directory relative to $PART: every '-' in the date becomes a '/'.
dp=${ds//-//}
# Everything before the first '-' is the year.
year=${ds%%-*}

# Output files of this run are all named "$corpus.<suffix>".
year_dir="$PART/corpus/$year"
mkdir -p "$year_dir"
corpus="$year_dir/$ds"

# The rest of the script works inside the day's input directory.
cd "$PART/$dp"

# Take the per-directory lock: a symlink named "lock" that points at
# "lock.<pid>". Creating the symlink fails when "lock" already exists, so
# only one run at a time can own it.
# ln is called inside the condition so that, under `set -e`, a failure still
# reaches the message below instead of ending the script silently.
lockfile=$PWD/lock
if ! ln -s "lock.$$" lock || [ "$(readlink lock)" != "lock.$$" ]; then
  echo Getting lock failed
  exit 1
fi
# The lock is ours from here on. Release it whenever the script exits, so a
# failing step cannot leave a stale lock that blocks every later run.
# The absolute path is needed because the working directory changes later.
trap 'rm -f -- "$lockfile"' EXIT

# Start with an empty scratch directory.
rm -fr out
mkdir out

# Rebuild "$corpus.lines" from scratch. Each *.txt file contributes its
# tokenized lines, each prefixed with "vk.<file name without .txt>.<n>|",
# where n counts the lines of that file starting at 1.
rm -f "$corpus.lines"
# Create the file up front so it exists even when there is no input.
: > "$corpus.lines"
for i in *.txt; do
  # Without any match the pattern itself is returned; skip it.
  [ -e "$i" ] || continue
  b=$(basename -- "$i" .txt)
  # First perl: strip leading whitespace and drop ##META lines.
  # Last perl: number the lines. The name is passed as an argument rather
  # than pasted into the Perl source, so characters such as @ $ " ' in a
  # file name cannot change the program.
  perl -p -e 's/^\s*//; s/^##META.*\n//' < "$i" \
    | tokenize.sh \
    | perl -e '$id = shift; $n = 0; while (<STDIN>) { $n++; print("vk.$id.$n|$_"); }' "$b" \
    >> "$corpus.lines"
done

# Parse all lines with Alpino from inside the scratch directory. Its standard
# error is kept as "$corpus.log".
# NOTE(review): the xml/ directory is presumably where Alpino writes one file
# per sentence (treebank=xml) - confirm against the Alpino documentation.
cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse \
  < "$corpus.lines" 2> "$corpus.log"

# Project command found via PATH; run in out/, before entering xml/.
metadata

# Pack the XML files into a fresh "$corpus.data.dz"; any old archive and
# index are removed first.
cd xml
rm -f "$corpus.data.dz" "$corpus.index"
alto -q -o "$corpus.data.dz" *.xml

# count per article, not per sentence
query.sh -x T -s "$corpus.data.dz" > "$corpus.tag.txt"

# Back to the day directory: remove the scratch directory and the lock.
cd ../..
rm -rf out

rm -f lock
Reference in New Issue
Block a user