grote reorganisatie:

- HLN, NOS, NU, VRT: per week -> per dag
- yyyy-ww -> yyyy.ww
- yyyy*  -> yyyy/yyyy*
etc.
This commit is contained in:
Peter Kleiweg
2026-05-27 22:42:03 +02:00
parent e430ff576b
commit 5c651387af
46 changed files with 328 additions and 227 deletions

View File

@@ -2,8 +2,11 @@
# Abort as soon as any command exits with a non-zero status.
set -e
# Base directory, and the NieuwsNL directory beneath it.
BASE=/net/corpora/nlnieuws
PART=$BASE/NieuwsNL
# Drop CDPATH so that `cd` does not search any other directories.
unset CDPATH
# NOTE(review): two consecutive PATH assignments. This text is a diff shown
# without +/- markers, so the first is presumably the removed line and the
# second its replacement -- confirm against the repository.
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
# Put $PART, $BASE/bin and $BASE ahead of /net/aps/bin and the inherited PATH.
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
# Export the time zone so that child processes use Europe/Amsterdam.
export TZ=Europe/Amsterdam
# Source the Alpino activation script, discarding its standard output.
. /net/aps/etc/alpino-activate.sh > /dev/null
@@ -25,10 +28,11 @@ else
fi
# $ds is set outside this excerpt. dp is $ds with every '-' replaced by '/';
# it is used below as a directory path under $PART.
dp=${ds//-//}
# Everything in $ds before its first '-'.
year=${ds%%-*}
# Common path prefix for this run's corpus files, inside a per-year directory
# that is created if it does not exist yet.
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
# NOTE(review): the next two lines repeat the `corpus=` and `cd` steps with
# hard-coded paths and no per-year directory. This text is a diff shown
# without +/- markers, so they are presumably the removed lines -- confirm.
corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds
cd /net/corpora/nlnieuws/NieuwsNL/$dp
cd $PART/$dp
# Create a symlink `lock` whose target is lock.<PID of this shell>; the test
# that follows compares the link's target with that same name.
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -53,14 +57,14 @@ cd out
# Create the `xml` directory; it is named as the treebank in the Alpino call
# and entered with `cd xml` further down.
mkdir xml
# Parse $corpus.lines with Alpino; its standard error goes to $corpus.log.
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
# NOTE(review): `metadata` is invoked twice, once by relative path and once
# via PATH. This text is a diff shown without +/- markers, so the first is
# presumably the removed line and the second its replacement -- confirm.
../../../../metadata
metadata
cd xml
# Remove any earlier data and index files for this corpus.
rm -f $corpus.data.dz $corpus.index
# Hand all *.xml files to alto with $corpus.data.dz as the -o argument
# (presumably packing them into that file -- confirm).
alto -q -o $corpus.data.dz *.xml
# count per news item, not per sentence
# NOTE(review): both commands below write to the same $corpus.tag.txt, so the
# second overwrites the first; presumably removed line followed by its
# replacement -- confirm.
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
# Go up two levels from xml and delete the `out` directory.
cd ../..
rm -fr out