From 5c651387af4cb6673d10363d59727d2d87922c92 Mon Sep 17 00:00:00 2001 From: Peter Kleiweg Date: Wed, 27 May 2026 22:42:03 +0200 Subject: [PATCH] grote reorganisatie: - HLN, NOS, NU, VRT: per week -> per dag - yyyy-ww -> yyyy.ww - yyyy* -> yyyy/yyyy* etc --- AT5/cmd/at5/at5.go | 2 +- AT5/cmd/xml2txt/xml2txt.go | 8 ++++---- AT5/txt2corpus.sh | 23 ++++++++++++++--------- BuurtAdam/cmd/buurtadam/buurtadam.go | 2 +- BuurtAdam/txt2corpus.sh | 23 ++++++++++++++--------- BuurtGrn/cmd/buurtgrn/buurtgrn.go | 2 +- BuurtGrn/txt2corpus.sh | 23 ++++++++++++++--------- GG/cmd/gg/gg.go | 2 +- GG/txt2corpus.sh | 21 +++++++++++++-------- HLN/cmd/hln/hln.go | 3 +-- HLN/txt2corpus.sh | 20 ++++++++++++-------- LitNL/cmd/litnl/litnl.go | 2 +- LitNL/cmd/xml2txt/xml2txt.go | 8 ++++---- LitNL/txt2corpus.sh | 25 +++++++++++++++---------- NOS/cmd/json2txt/json2txt.go | 10 +++++----- NOS/cmd/nos/nos.go | 3 +-- NOS/txt2corpus.sh | 24 ++++++++++++++---------- NU/cmd/nu/nu.go | 3 +-- NU/txt2corpus.sh | 20 ++++++++++++-------- NieuwsNL/txt2corpus.sh | 16 ++++++++++------ Oog/cmd/oog/oog.go | 2 +- Oog/cmd/xml2txt/xml2txt.go | 8 ++++---- Oog/txt2corpus.sh | 25 +++++++++++++++---------- Parool/cmd/parool/parool.go | 2 +- Parool/txt2corpus.sh | 23 ++++++++++++++--------- RO/cmd/ro/ro.go | 2 +- RO/cmd/xml2txt/xml2txt.go | 8 ++++---- RO/txt2corpus.sh | 25 +++++++++++++++---------- RTVNoord/cmd/rtvnoord/rtvnoord.go | 2 +- RTVNoord/txt2corpus.sh | 25 +++++++++++++++---------- Sargasso/cmd/sargasso/sargasso.go | 2 +- Sargasso/cmd/xml2txt/xml2txt.go | 8 ++++---- Sargasso/txt2corpus.sh | 25 +++++++++++++++---------- Sikkom/cmd/sikkom/sikkom.go | 2 +- Sikkom/txt2corpus.sh | 21 +++++++++++++-------- Tzum/cmd/tzum/tzum.go | 2 +- Tzum/cmd/xml2txt/xml2txt.go | 8 ++++---- Tzum/txt2corpus.sh | 25 +++++++++++++++---------- VRT/cmd/vrt/vrt.go | 3 +-- VRT/txt2corpus.sh | 22 +++++++++++++--------- cmd/data2json/data2json.go | 9 +++++---- cmd/dates2json/dates2json.go | 21 ++++++++++++++++----- cmd/week2files/week2files.go | 6 +++--- collect.sh | 20 ++++++++++++-------- internal/util/util.go | 7 +++++-- www/app.js | 12 +++++++++--- 46 files changed, 328 insertions(+), 227 deletions(-) diff --git a/AT5/cmd/at5/at5.go b/AT5/cmd/at5/at5.go index 9f739f1..fa7c2a4 100644 --- a/AT5/cmd/at5/at5.go +++ b/AT5/cmd/at5/at5.go @@ -77,7 +77,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/AT5/cmd/xml2txt/xml2txt.go b/AT5/cmd/xml2txt/xml2txt.go index f249581..3f26994 100644 --- a/AT5/cmd/xml2txt/xml2txt.go +++ b/AT5/cmd/xml2txt/xml2txt.go @@ -22,7 +22,7 @@ type Item struct { var ( x = e.ExitErr - reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`) ) func main() { @@ -31,16 +31,16 @@ func main() { switch len(os.Args) { case 1: year, week := time.Now().AddDate(0, 0, -7).ISOWeek() - ds = fmt.Sprintf("%d-%02d", year, week) + ds = fmt.Sprintf("%d.%02d", year, week) case 2: if !reYearWeek.MatchString(os.Args[1]) { - x(fmt.Errorf("arg must be yyyy-ww")) + x(fmt.Errorf("arg must be yyyy.ww")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } - dp := ds[:4] + "/" + ds[5:] + dp := ds[:4] + "/w" + ds[5:] x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp)) x(os.MkdirAll("out", 0777)) diff --git a/AT5/txt2corpus.sh b/AT5/txt2corpus.sh index dbc13c9..def7e22 100755 --- a/AT5/txt2corpus.sh +++ b/AT5/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/AT5 + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/AT5/corpus/$ds - -cd /net/corpora/nlnieuws/AT5/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -38,7 +43,7 @@ fi rm -fr out mkdir out -../../xml2txt $ds +xml2txt $ds rm -f $corpus.lines for i in out/*.txt @@ -53,7 +58,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index diff --git a/BuurtAdam/cmd/buurtadam/buurtadam.go b/BuurtAdam/cmd/buurtadam/buurtadam.go index 77ce869..8d7dee5 100644 --- a/BuurtAdam/cmd/buurtadam/buurtadam.go +++ b/BuurtAdam/cmd/buurtadam/buurtadam.go @@ -95,7 +95,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/BuurtAdam/txt2corpus.sh b/BuurtAdam/txt2corpus.sh index 15a69b8..b4e2fc7 100755 --- a/BuurtAdam/txt2corpus.sh +++ b/BuurtAdam/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/BuurtAdam + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/BuurtAdam/corpus/$ds - -cd /net/corpora/nlnieuws/BuurtAdam/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -51,14 +56,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/BuurtGrn/cmd/buurtgrn/buurtgrn.go b/BuurtGrn/cmd/buurtgrn/buurtgrn.go index 58b3062..3e27ac8 100644 --- a/BuurtGrn/cmd/buurtgrn/buurtgrn.go +++ b/BuurtGrn/cmd/buurtgrn/buurtgrn.go @@ -94,7 +94,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/BuurtGrn/txt2corpus.sh b/BuurtGrn/txt2corpus.sh index 50df2d2..d3b3677 100755 --- a/BuurtGrn/txt2corpus.sh +++ b/BuurtGrn/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/BuurtGrn + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/BuurtGrn/corpus/$ds - -cd /net/corpora/nlnieuws/BuurtGrn/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -51,14 +56,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/GG/cmd/gg/gg.go b/GG/cmd/gg/gg.go index 72a3b4b..1a8c19a 100644 --- a/GG/cmd/gg/gg.go +++ b/GG/cmd/gg/gg.go @@ -95,7 +95,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/GG/txt2corpus.sh b/GG/txt2corpus.sh index cfae1d9..e2e44c6 100755 --- a/GG/txt2corpus.sh +++ b/GG/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/GG + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/GG/corpus/$ds - -cd /net/corpora/nlnieuws/GG/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -51,7 +56,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index diff --git a/HLN/cmd/hln/hln.go b/HLN/cmd/hln/hln.go index bdfefce..30bc851 100644 --- a/HLN/cmd/hln/hln.go +++ b/HLN/cmd/hln/hln.go @@ -101,8 +101,7 @@ func main() { t, err = time.Parse(time.RFC1123, item.PubDate) } p(err) - year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day()) if exists(dirname + "/lock") { continue } diff --git a/HLN/txt2corpus.sh b/HLN/txt2corpus.sh index f05ce90..e85f206 100755 --- a/HLN/txt2corpus.sh +++ b/HLN/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/HLN + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -2days +%Y-%m-%d` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]) ds=$1 ;; *) @@ -23,10 +26,11 @@ else fi dp=${ds//-//} +year=${ds%%-*} +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/HLN/corpus/$ds - -cd /net/corpora/nlnieuws/HLN/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -51,14 +55,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/LitNL/cmd/litnl/litnl.go b/LitNL/cmd/litnl/litnl.go index afa1a0e..1d9dc1b 100644 --- a/LitNL/cmd/litnl/litnl.go +++ b/LitNL/cmd/litnl/litnl.go @@ -77,7 +77,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/LitNL/cmd/xml2txt/xml2txt.go b/LitNL/cmd/xml2txt/xml2txt.go index 777071a..45ca364 100644 --- a/LitNL/cmd/xml2txt/xml2txt.go +++ b/LitNL/cmd/xml2txt/xml2txt.go @@ -24,7 +24,7 @@ var ( w = e.WarnErr x = e.ExitErr - reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`) ) func main() { @@ -33,16 +33,16 @@ func main() { switch len(os.Args) { case 1: year, week := time.Now().AddDate(0, 0, -7).ISOWeek() - ds = fmt.Sprintf("%d-%02d", year, week) + ds = fmt.Sprintf("%d.%02d", year, week) case 2: if !reYearWeek.MatchString(os.Args[1]) { - x(fmt.Errorf("arg must be yyyy-ww")) + x(fmt.Errorf("arg must be yyyy.ww")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } - dp := ds[:4] + "/" + ds[5:] + dp := ds[:4] + "/w" + ds[5:] x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp)) x(os.MkdirAll("out", 0777)) diff --git a/LitNL/txt2corpus.sh b/LitNL/txt2corpus.sh index 2e190c5..4a31901 100755 --- a/LitNL/txt2corpus.sh +++ b/LitNL/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/LitNL + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/LitNL/corpus/$ds - -cd /net/corpora/nlnieuws/LitNL/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -38,7 +43,7 @@ fi rm -fr out mkdir out -../../xml2txt $ds +xml2txt $ds rm -f $corpus.lines for i in out/*.txt @@ -53,14 +58,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/NOS/cmd/json2txt/json2txt.go b/NOS/cmd/json2txt/json2txt.go index 8c9e0b1..36e377c 100644 --- a/NOS/cmd/json2txt/json2txt.go +++ b/NOS/cmd/json2txt/json2txt.go @@ -28,7 +28,7 @@ type Item struct { var ( x = e.ExitErr - reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`) ) func main() { @@ -36,17 +36,17 @@ func main() { var ds string switch len(os.Args) { case 1: - year, week := time.Now().AddDate(0, 0, -7).ISOWeek() - ds = fmt.Sprintf("%d-%02d", year, week) + t := time.Now().AddDate(0, 0, -2) + ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day()) case 2: if !reYearWeek.MatchString(os.Args[1]) { - x(fmt.Errorf("arg must be yyyy-ww")) + x(fmt.Errorf("arg must be yyyy-mm-dd")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } - dp := ds[:4] + "/" + ds[5:] + dp := strings.ReplaceAll(ds, "-", "/") x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp)) x(os.MkdirAll("out", 0777)) diff --git a/NOS/cmd/nos/nos.go b/NOS/cmd/nos/nos.go index 903fc8a..d56b51d 100644 --- a/NOS/cmd/nos/nos.go +++ b/NOS/cmd/nos/nos.go @@ -94,8 +94,7 @@ func main() { } } p(err) - year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day()) if exists(dirname + "/lock") { continue } diff --git a/NOS/txt2corpus.sh b/NOS/txt2corpus.sh index 0b96fa3..77a501a 100755 --- a/NOS/txt2corpus.sh +++ b/NOS/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/NOS + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -2days +%Y-%m-%d` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]) ds=$1 ;; *) @@ -23,10 +26,11 @@ else fi dp=${ds//-//} +year=${ds%%-*} +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/NOS/corpus/$ds - -cd /net/corpora/nlnieuws/NOS/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -38,7 +42,7 @@ fi rm -fr out mkdir out -../../json2txt $ds +json2txt $ds rm -f $corpus.lines for i in out/*.txt @@ -53,15 +57,15 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x C -s $corpus.data.dz > $corpus.cat.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/NU/cmd/nu/nu.go b/NU/cmd/nu/nu.go index 3ba3c8e..de469a5 100644 --- a/NU/cmd/nu/nu.go +++ b/NU/cmd/nu/nu.go @@ -98,8 +98,7 @@ func main() { t, err = time.Parse(time.RFC1123, item.PubDate) } p(err) - year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day()) if exists(dirname + "/lock") { continue } diff --git a/NU/txt2corpus.sh b/NU/txt2corpus.sh index e141d35..ab097cf 100755 --- a/NU/txt2corpus.sh +++ b/NU/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/NU + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -2days +%Y-%m-%d` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]) ds=$1 ;; *) @@ -23,10 +26,11 @@ else fi dp=${ds//-//} +year=${ds%%-*} +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/NU/corpus/$ds - -cd /net/corpora/nlnieuws/NU/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -51,14 +55,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/NieuwsNL/txt2corpus.sh b/NieuwsNL/txt2corpus.sh index c41fbb2..5401c3f 100755 --- a/NieuwsNL/txt2corpus.sh +++ b/NieuwsNL/txt2corpus.sh @@ -2,8 +2,11 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/NieuwsNL + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null @@ -25,10 +28,11 @@ else fi dp=${ds//-//} +year=${ds%%-*} +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds - -cd /net/corpora/nlnieuws/NieuwsNL/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -53,14 +57,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/Oog/cmd/oog/oog.go b/Oog/cmd/oog/oog.go index 5d2ef4c..456e9f3 100644 --- a/Oog/cmd/oog/oog.go +++ b/Oog/cmd/oog/oog.go @@ -77,7 +77,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/Oog/cmd/xml2txt/xml2txt.go b/Oog/cmd/xml2txt/xml2txt.go index 8a8f171..d84e686 100644 --- a/Oog/cmd/xml2txt/xml2txt.go +++ b/Oog/cmd/xml2txt/xml2txt.go @@ -23,7 +23,7 @@ type Item struct { var ( x = e.ExitErr - reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`) ) func main() { @@ -32,16 +32,16 @@ func main() { switch len(os.Args) { case 1: year, week := time.Now().AddDate(0, 0, -7).ISOWeek() - ds = fmt.Sprintf("%d-%02d", year, week) + ds = fmt.Sprintf("%d.%02d", year, week) case 2: if !reYearWeek.MatchString(os.Args[1]) { - x(fmt.Errorf("arg must be yyyy-ww")) + x(fmt.Errorf("arg must be yyyy.ww")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } - dp := ds[:4] + "/" + ds[5:] + dp := ds[:4] + "/w" + ds[5:] x(os.Chdir("/net/corpora/nlnieuws/Oog/" + dp)) x(os.MkdirAll("out", 0777)) diff --git a/Oog/txt2corpus.sh b/Oog/txt2corpus.sh index bf8ecc0..1d80dcf 100755 --- a/Oog/txt2corpus.sh +++ b/Oog/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/Oog + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/Oog/corpus/$ds - -cd /net/corpora/nlnieuws/Oog/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -38,7 +43,7 @@ fi rm -fr out mkdir out -../../xml2txt $ds +xml2txt $ds rm -f $corpus.lines for i in out/*.txt @@ -53,14 +58,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/Parool/cmd/parool/parool.go b/Parool/cmd/parool/parool.go index 18aeb31..4f5aa9a 100644 --- a/Parool/cmd/parool/parool.go +++ b/Parool/cmd/parool/parool.go @@ -102,7 +102,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/Parool/txt2corpus.sh b/Parool/txt2corpus.sh index 46b0e62..2bd5bf3 100755 --- a/Parool/txt2corpus.sh +++ b/Parool/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/Parool + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/Parool/corpus/$ds - -cd /net/corpora/nlnieuws/Parool/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -51,14 +56,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/RO/cmd/ro/ro.go b/RO/cmd/ro/ro.go index 7d65f88..32a5ec3 100644 --- a/RO/cmd/ro/ro.go +++ b/RO/cmd/ro/ro.go @@ -77,7 +77,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/RO/cmd/xml2txt/xml2txt.go b/RO/cmd/xml2txt/xml2txt.go index 9260b1e..7997986 100644 --- a/RO/cmd/xml2txt/xml2txt.go +++ b/RO/cmd/xml2txt/xml2txt.go @@ -26,7 +26,7 @@ var ( x = e.ExitErr w = e.WarnErr - reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`) ) func main() { @@ -38,16 +38,16 @@ func main() { switch len(os.Args) { case 1: year, week := time.Now().AddDate(0, 0, -7).ISOWeek() - ds = fmt.Sprintf("%d-%02d", year, week) + ds = fmt.Sprintf("%d.%02d", year, week) case 2: if !reYearWeek.MatchString(os.Args[1]) { - x(fmt.Errorf("arg must be yyyy-ww")) + x(fmt.Errorf("arg must be yyyy.ww")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } - dp := ds[:4] + "/" + ds[5:] + dp := ds[:4] + "/w" + ds[5:] x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp)) x(os.MkdirAll("out", 0777)) diff --git a/RO/txt2corpus.sh b/RO/txt2corpus.sh index a895345..e0bc5a2 100755 --- a/RO/txt2corpus.sh +++ b/RO/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/RO + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/RO/corpus/$ds - -cd /net/corpora/nlnieuws/RO/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -38,7 +43,7 @@ fi rm -fr out mkdir out -../../xml2txt $ds +xml2txt $ds rm -f $corpus.lines for i in out/*.txt @@ -53,14 +58,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/RTVNoord/cmd/rtvnoord/rtvnoord.go b/RTVNoord/cmd/rtvnoord/rtvnoord.go index f16a712..1d8935c 100644 --- a/RTVNoord/cmd/rtvnoord/rtvnoord.go +++ b/RTVNoord/cmd/rtvnoord/rtvnoord.go @@ -101,7 +101,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/RTVNoord/txt2corpus.sh b/RTVNoord/txt2corpus.sh index b84fa09..a707955 100755 --- a/RTVNoord/txt2corpus.sh +++ b/RTVNoord/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/RTVNoord + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/RTVNoord/corpus/$ds - -cd /net/corpora/nlnieuws/RTVNoord/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -51,15 +56,15 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x C -s $corpus.data.dz > $corpus.cat.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/Sargasso/cmd/sargasso/sargasso.go b/Sargasso/cmd/sargasso/sargasso.go index c5ceb59..d22b9eb 100644 --- a/Sargasso/cmd/sargasso/sargasso.go +++ b/Sargasso/cmd/sargasso/sargasso.go @@ -77,7 +77,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/Sargasso/cmd/xml2txt/xml2txt.go b/Sargasso/cmd/xml2txt/xml2txt.go index 34274c8..a1d5dcc 100644 --- a/Sargasso/cmd/xml2txt/xml2txt.go +++ b/Sargasso/cmd/xml2txt/xml2txt.go @@ -23,7 +23,7 @@ type Item struct { var ( x = e.ExitErr - reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`) ) func main() { @@ -32,16 +32,16 @@ func main() { switch len(os.Args) { case 1: year, week := time.Now().AddDate(0, 0, -7).ISOWeek() - ds = fmt.Sprintf("%d-%02d", year, week) + ds = fmt.Sprintf("%d.%02d", year, week) case 2: if !reYearWeek.MatchString(os.Args[1]) { - x(fmt.Errorf("arg must be yyyy-ww")) + x(fmt.Errorf("arg must be yyyy.ww")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } - dp := ds[:4] + "/" + ds[5:] + dp := ds[:4] + "/w" + ds[5:] x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp)) x(os.MkdirAll("out", 0777)) diff --git a/Sargasso/txt2corpus.sh b/Sargasso/txt2corpus.sh index 9ef8e8c..37f7d8f 100755 --- a/Sargasso/txt2corpus.sh +++ b/Sargasso/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/Sargasso + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/Sargasso/corpus/$ds - -cd /net/corpora/nlnieuws/Sargasso/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -38,7 +43,7 @@ fi rm -fr out mkdir out -../../xml2txt $ds +xml2txt $ds rm -f $corpus.lines for i in out/*.txt @@ -53,14 +58,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/Sikkom/cmd/sikkom/sikkom.go b/Sikkom/cmd/sikkom/sikkom.go index 599647a..443c32b 100644 --- a/Sikkom/cmd/sikkom/sikkom.go +++ b/Sikkom/cmd/sikkom/sikkom.go @@ -90,7 +90,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/Sikkom/txt2corpus.sh b/Sikkom/txt2corpus.sh index dc5223c..3627c89 100755 --- a/Sikkom/txt2corpus.sh +++ b/Sikkom/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/Sikkom + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/Sikkom/corpus/$ds - -cd /net/corpora/nlnieuws/Sikkom/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -51,7 +56,7 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index diff --git a/Tzum/cmd/tzum/tzum.go b/Tzum/cmd/tzum/tzum.go index f115b44..cad1296 100644 --- a/Tzum/cmd/tzum/tzum.go +++ b/Tzum/cmd/tzum/tzum.go @@ -77,7 +77,7 @@ func main() { } p(err) year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/w%02d", year, week) if exists(dirname + "/lock") { continue } diff --git a/Tzum/cmd/xml2txt/xml2txt.go b/Tzum/cmd/xml2txt/xml2txt.go index 0a6d144..0acc04f 100644 --- a/Tzum/cmd/xml2txt/xml2txt.go +++ b/Tzum/cmd/xml2txt/xml2txt.go @@ -23,7 +23,7 @@ type Item struct { var ( x = e.ExitErr - reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) + reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`) ) func main() { @@ -32,16 +32,16 @@ func main() { switch len(os.Args) { case 1: year, week := time.Now().AddDate(0, 0, -7).ISOWeek() - ds = fmt.Sprintf("%d-%02d", year, week) + ds = fmt.Sprintf("%d.%02d", year, week) case 2: if !reYearWeek.MatchString(os.Args[1]) { - x(fmt.Errorf("arg must be yyyy-ww")) + x(fmt.Errorf("arg must be yyyy.ww")) } ds = os.Args[1] default: x(fmt.Errorf("too many arguments")) } - dp := ds[:4] + "/" + ds[5:] + dp := ds[:4] + "/w" + ds[5:] x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp)) x(os.MkdirAll("out", 0777)) diff --git a/Tzum/txt2corpus.sh b/Tzum/txt2corpus.sh index 9cb7a60..18726dc 100755 --- a/Tzum/txt2corpus.sh +++ b/Tzum/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/Tzum + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -22,11 +25,13 @@ else esac fi -dp=${ds//-//} +year=${ds%.*} +week=${ds#*.} +dp=$year/w$week +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/Tzum/corpus/$ds - -cd /net/corpora/nlnieuws/Tzum/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -38,7 +43,7 @@ fi rm -fr out mkdir out -../../xml2txt $ds +xml2txt $ds rm -f $corpus.lines for i in out/*.txt @@ -53,14 +58,14 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/VRT/cmd/vrt/vrt.go b/VRT/cmd/vrt/vrt.go index 258c36c..09e8e1a 100644 --- a/VRT/cmd/vrt/vrt.go +++ b/VRT/cmd/vrt/vrt.go @@ -110,8 +110,7 @@ func main() { if t2.After(t) { t = t2 } - year, week := t.ISOWeek() - dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day()) if exists(dirname + "/lock") { continue } diff --git a/VRT/txt2corpus.sh b/VRT/txt2corpus.sh index fc6c9c9..9d6158d 100755 --- a/VRT/txt2corpus.sh +++ b/VRT/txt2corpus.sh @@ -2,17 +2,20 @@ set -e +BASE=/net/corpora/nlnieuws +PART=$BASE/VRT + unset CDPATH -PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH +PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH export TZ=Europe/Amsterdam . /net/aps/etc/alpino-activate.sh > /dev/null if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -2days +%Y-%m-%d` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]) ds=$1 ;; *) @@ -23,10 +26,11 @@ else fi dp=${ds//-//} +year=${ds%%-*} +corpus=$PART/corpus/$year/$ds +mkdir -p $PART/corpus/$year -corpus=/net/corpora/nlnieuws/VRT/corpus/$ds - -cd /net/corpora/nlnieuws/VRT/$dp +cd $PART/$dp ln -s lock.$$ lock if [ "`readlink lock`" != lock.$$ ] @@ -51,15 +55,15 @@ cd out mkdir xml Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log -../../../metadata +metadata cd xml rm -f $corpus.data.dz $corpus.index alto -q -o $corpus.data.dz *.xml # telling per bericht, niet per zin -/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt -/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt +query.sh -x C -s $corpus.data.dz > $corpus.cat.txt +query.sh -x T -s $corpus.data.dz > $corpus.tag.txt cd ../.. rm -fr out diff --git a/cmd/data2json/data2json.go b/cmd/data2json/data2json.go index 07a3a6d..e6765cc 100644 --- a/cmd/data2json/data2json.go +++ b/cmd/data2json/data2json.go @@ -70,9 +70,9 @@ var ( func main() { - aa := strings.Split(os.Args[1], "-") + aa := strings.Split(os.Args[1], ".") if len(aa) != 2 { - x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn")) + x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn")) } var err error @@ -128,7 +128,8 @@ func makeParts(source string) *Parts { func makeValues(source, part string) [][5]any { v := make([][5]any, 0) - filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s", + filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d-%02d-%d%s", + year, sources[source], parts[part].file, year, @@ -226,7 +227,7 @@ func dates() (start, first, last string, names []string) { t3 := tStart for range size { y, w := t3.ISOWeek() - names = append(names, fmt.Sprintf("%d/%02d", y, w)) + names = append(names, fmt.Sprintf("%d/w%02d", y, w)) t3 = t3.AddDate(0, 0, 7) } t3 = tStart diff --git a/cmd/dates2json/dates2json.go b/cmd/dates2json/dates2json.go index 11d00fd..f5a5a9a 100644 --- a/cmd/dates2json/dates2json.go +++ b/cmd/dates2json/dates2json.go @@ -26,13 +26,24 @@ var ( func main() { - files, err := os.ReadDir("/net/corpora/nlnieuws/data/json") + dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json") x(err) + for _, dir := range dirs { + if !dir.IsDir() { + continue + } + dirname := dir.Name() + if dirname[0] != '2' { + continue + } + files, err := os.ReadDir("/net/corpora/nlnieuws/data/json/" + dirname) + x(err) - for _, file := range files { - filename := file.Name() - if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") { - addWeek(filename[5:12]) + for _, file := range files { + filename := file.Name() + if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") { + addWeek(filename[5:12]) + } } } diff --git a/cmd/week2files/week2files.go b/cmd/week2files/week2files.go index b0ba576..37aa3d7 100644 --- a/cmd/week2files/week2files.go +++ b/cmd/week2files/week2files.go @@ -15,9 +15,9 @@ var ( ) func main() { - aa := strings.Split(os.Args[1], "-") + aa := strings.Split(os.Args[1], ".") if len(aa) != 2 { - x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn")) + x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn")) } year, err := strconv.Atoi(aa[0]) @@ -58,7 +58,7 @@ func main() { fmt.Print(" -or") } y, w := t2.ISOWeek() - fmt.Printf(" -name %d-%02d.data.dz", y, w) + fmt.Printf(" -name %d.%02d.data.dz", y, w) t2 = t2.AddDate(0, 0, 7) } diff --git a/collect.sh b/collect.sh index 3f88440..43b5ce8 100755 --- a/collect.sh +++ b/collect.sh @@ -22,10 +22,10 @@ say () { if [ "$1" = "" ] then - ds=`date -d -7days +%G-%V` + ds=`date -d -7days +%G.%V` else case "$1" in - 2[0-9][0-9][0-9]-[0-5][0-9]) + 2[0-9][0-9][0-9].[0-5][0-9]) ds=$1 ;; *) @@ -35,7 +35,11 @@ else esac fi -cd /net/corpora/nlnieuws/data +year=${ds%%.*} + +mkdir -p /net/corpora/nlnieuws/data/$year +mkdir -p /net/corpora/nlnieuws/data/json/$year +cd /net/corpora/nlnieuws/data/$year declare -A parts #parts[alles]='.' @@ -68,7 +72,7 @@ do for i in 1 4 do - files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true + files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true if [ -z "$files" ] then continue @@ -161,10 +165,10 @@ do done done -data2json $ds 1 > json/DATA-$ds-1.json -data2json $ds 4 > json/DATA-$ds-4.json -dates2json > json/index1.json -dates2json > json/index4.json +data2json $ds 1 > ../json/$year/DATA-$ds-1.json +data2json $ds 4 > ../json/$year/DATA-$ds-4.json +dates2json > ../json/index1.json +dates2json > ../json/index4.json # rechten bijwerken chmod -R g+w /net/corpora/nlnieuws diff --git a/internal/util/util.go b/internal/util/util.go index ab7ea8e..98623b2 100644 --- a/internal/util/util.go +++ b/internal/util/util.go @@ -17,8 +17,11 @@ var ( func AddEnd(s string) string { s = strings.TrimSpace(s) - if s == "" || reEOL.MatchString(s) { - return s + if s == "" { + return "" + } + if reEOL.MatchString(s) { + return s + "\n" } return s + ".\n" } diff --git a/www/app.js b/www/app.js index da033ca..b42c9da 100644 --- a/www/app.js +++ b/www/app.js @@ -133,7 +133,9 @@ function makeTD(title, values) { async function loadSource(source, week) { if (!data.has(week)) { - data[week] = await getJSON('DATA-' + week + '-4.json') + data[week] = await getJSON( + week.substring(0, 4) + '/DATA-' + week + '-4.json', + ) } idSubtitle.innerHTML = source + ' — t/m ' + data[week].last @@ -154,7 +156,9 @@ async function loadSource(source, week) { async function loadPart(part, week) { if (!data.has(week)) { - data[week] = await getJSON('DATA-' + week + '-4.json') + data[week] = await getJSON( + week.substring(0, 4) + '/DATA-' + week + '-4.json', + ) } idSubtitle.innerHTML = part + ' — t/m ' + data[week].last @@ -185,7 +189,9 @@ async function loadWeken(source, part) { if (i < dates.length) { var week = dates[i].week if (!data.has(week)) { - data[week] = await getJSON('DATA-' + week + '-4.json') + data[week] = await getJSON( + week.substring(0, 4) + '/DATA-' + week + '-4.json', + ) } var values = data[week][source][part] tr.appendChild(makeTD('t/m ' + data[week].last, values))