first commit
This commit is contained in:
65
RO/txt2corpus.sh
Executable file
65
RO/txt2corpus.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash

# Abort as soon as any command fails.
set -e

# The script uses relative `cd` calls; CDPATH must not influence them.
unset CDPATH
# Search the corpus and APS tool directories before the inherited PATH.
PATH="/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH"
export TZ=Europe/Amsterdam
# Load the Alpino environment; whatever the activation script prints on
# stdout is discarded.
. /net/aps/etc/alpino-activate.sh > /dev/null

# Succeed when $1 has the accepted stamp shape: "2", three digits, a dash,
# then one digit 0-5 and one digit 0-9 (e.g. 2024-07).
# NOTE(review): presumably this is <year>-<ISO week>; confirm with ISODate.
is_valid_stamp() {
    case "$1" in
        2[0-9][0-9][0-9]-[0-5][0-9])
            return 0
            ;;
        *)
            return 1
            ;;
    esac
}

# Choose the stamp to process (ds): the first argument when given,
# otherwise whatever `ISODate -7` prints.
if [ -z "$1" ]
then
    ds=$(ISODate -7)
elif is_valid_stamp "$1"
then
    ds=$1
else
    # Diagnostics belong on stderr so they do not mix with regular output.
    echo INVALID >&2
    exit 1
fi

# dp is ds with every "-" replaced by "/", so it can be used as a sub-path.
dp=${ds//-//}

# Common prefix of the output files; suffixes such as .lines, .log and
# .data.dz are appended to it later in the script.
corpus=/net/corpora/nlnieuws/RO/corpus/$ds

# All relative paths from here on are resolved against this directory.
cd "/net/corpora/nlnieuws/RO/$dp"

# Take the lock for the current directory by creating the symlink
# "lock" -> "lock.<pid>". Creating a symlink fails when the name already
# exists, so an existing lock makes `ln` fail.
# `ln` is tested inside the `if` so that a failure is reported here instead
# of aborting the script under `set -e` before the message is printed.
lockfile=$PWD/lock
if ! ln -s "lock.$$" "$lockfile" || [ "$(readlink "$lockfile")" != "lock.$$" ]
then
    echo Getting lock failed >&2
    exit 1
fi
# The lock is ours from here on: remove it whenever the script exits, also
# when a later command fails, so a failed run does not block the next one.
# An absolute path is used because the script changes directory later.
trap 'rm -f "$lockfile"' EXIT

# Copy stdin to stdout, prefixing each line with "ro.<id>.<n>|", where <id>
# is $1 and <n> is the 1-based line number.
# The id is handed to perl as an argument rather than pasted into the perl
# source, so ids containing characters such as "@", "$" or quotes are safe.
number_lines() {
    perl -e '
        my $id = shift;
        my $n = 0;
        while (<STDIN>) { $n++; print("ro.$id.$n|$_"); }
    ' "$1"
}

# Start from an empty scratch directory.
rm -fr out
mkdir out

# NOTE(review): xml2txt presumably fills out/ with one .txt file per
# document for the given stamp; confirm against that tool.
../../xml2txt "$ds"

# Rebuild the lines file from scratch. It is created up front so that it
# exists even when no .txt file is found.
rm -f "$corpus.lines"
: > "$corpus.lines"
for i in out/*.txt
do
    # With no match the pattern stays literal; skip it.
    [ -e "$i" ] || continue
    b=$(basename "$i" .txt)
    # Strip leading whitespace and drop ##META lines, tokenize, then label
    # every resulting line with the document name and its line number.
    perl -p -e 's/^\s*//; s/^##META.*\n//' "$i" | tokenize.sh \
        | number_lines "$b" \
        >> "$corpus.lines"
done

cd out
mkdir xml
# Parse the collected lines with Alpino; its stderr goes to the log file.
# NOTE(review): "-flag treebank xml" presumably makes Alpino write its
# output into the xml directory created above; confirm.
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < "$corpus.lines" 2> "$corpus.log"

# Run the metadata step; its stderr is captured in a scratch file that is
# removed right away.
../../../metadata 2> err
rm err

cd xml
# Pack all XML files into one data file; alto's stderr is discarded.
alto -o "$corpus.data.dz" *.xml 2> /dev/null

# Back in the directory holding out/ and the lock: clean up.
cd ../..
rm -fr out

rm -f lock

Reference in New Issue
Block a user