grote reorganisatie:
- HLN, NOS, NU, VRT: per week -> per dag
- yyyy-ww -> yyyy.ww
- yyyy* -> yyyy/yyyy*
- etc.
This commit is contained in:
@@ -2,17 +2,20 @@
|
||||
|
||||
set -e
|
||||
|
||||
BASE=/net/corpora/nlnieuws
|
||||
PART=$BASE/Parool
|
||||
|
||||
unset CDPATH
|
||||
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
|
||||
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
|
||||
export TZ=Europe/Amsterdam
|
||||
. /net/aps/etc/alpino-activate.sh > /dev/null
|
||||
|
||||
if [ "$1" = "" ]
|
||||
then
|
||||
ds=`date -d -7days +%G-%V`
|
||||
ds=`date -d -7days +%G.%V`
|
||||
else
|
||||
case "$1" in
|
||||
2[0-9][0-9][0-9]-[0-5][0-9])
|
||||
2[0-9][0-9][0-9].[0-5][0-9])
|
||||
ds=$1
|
||||
;;
|
||||
*)
|
||||
@@ -22,11 +25,13 @@ else
|
||||
esac
|
||||
fi
|
||||
|
||||
dp=${ds//-//}
|
||||
year=${ds%.*}
|
||||
week=${ds#*.}
|
||||
dp=$year/w$week
|
||||
corpus=$PART/corpus/$year/$ds
|
||||
mkdir -p $PART/corpus/$year
|
||||
|
||||
corpus=/net/corpora/nlnieuws/Parool/corpus/$ds
|
||||
|
||||
cd /net/corpora/nlnieuws/Parool/$dp
|
||||
cd $PART/$dp
|
||||
|
||||
ln -s lock.$$ lock
|
||||
if [ "`readlink lock`" != lock.$$ ]
|
||||
@@ -51,14 +56,14 @@ cd out
|
||||
mkdir xml
|
||||
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
|
||||
|
||||
../../../metadata
|
||||
metadata
|
||||
|
||||
cd xml
|
||||
rm -f $corpus.data.dz $corpus.index
|
||||
alto -q -o $corpus.data.dz *.xml
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
Reference in New Issue
Block a user