grote reorganisatie:

- HLN, NOS, NU, VRT: per week -> per dag
- yyyy-ww -> yyyy.ww
- yyyy*  -> yyyy/yyyy*
etc
This commit is contained in:
Peter Kleiweg
2026-05-27 22:42:03 +02:00
parent e430ff576b
commit 5c651387af
46 changed files with 328 additions and 227 deletions

View File

@@ -77,7 +77,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -22,7 +22,7 @@ type Item struct {
var ( var (
x = e.ExitErr x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
) )
func main() { func main() {
@@ -31,16 +31,16 @@ func main() {
switch len(os.Args) { switch len(os.Args) {
case 1: case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek() year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week) ds = fmt.Sprintf("%d.%02d", year, week)
case 2: case 2:
if !reYearWeek.MatchString(os.Args[1]) { if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww")) x(fmt.Errorf("arg must be yyyy.ww"))
} }
ds = os.Args[1] ds = os.Args[1]
default: default:
x(fmt.Errorf("too many arguments")) x(fmt.Errorf("too many arguments"))
} }
dp := ds[:4] + "/" + ds[5:] dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp)) x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp))
x(os.MkdirAll("out", 0777)) x(os.MkdirAll("out", 0777))

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/AT5
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/AT5/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/AT5/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out rm -fr out
mkdir out mkdir out
../../xml2txt $ds xml2txt $ds
rm -f $corpus.lines rm -f $corpus.lines
for i in out/*.txt for i in out/*.txt
@@ -53,7 +58,7 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index

View File

@@ -95,7 +95,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/BuurtAdam
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/BuurtAdam/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/BuurtAdam/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -94,7 +94,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/BuurtGrn
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/BuurtGrn/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/BuurtGrn/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -95,7 +95,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/GG
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/GG/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/GG/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -51,7 +56,7 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index

View File

@@ -101,8 +101,7 @@ func main() {
t, err = time.Parse(time.RFC1123, item.PubDate) t, err = time.Parse(time.RFC1123, item.PubDate)
} }
p(err) p(err)
year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/HLN
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -2days +%Y-%m-%d`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -23,10 +26,11 @@ else
fi fi
dp=${ds//-//} dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/HLN/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/HLN/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +55,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -77,7 +77,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -24,7 +24,7 @@ var (
w = e.WarnErr w = e.WarnErr
x = e.ExitErr x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
) )
func main() { func main() {
@@ -33,16 +33,16 @@ func main() {
switch len(os.Args) { switch len(os.Args) {
case 1: case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek() year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week) ds = fmt.Sprintf("%d.%02d", year, week)
case 2: case 2:
if !reYearWeek.MatchString(os.Args[1]) { if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww")) x(fmt.Errorf("arg must be yyyy.ww"))
} }
ds = os.Args[1] ds = os.Args[1]
default: default:
x(fmt.Errorf("too many arguments")) x(fmt.Errorf("too many arguments"))
} }
dp := ds[:4] + "/" + ds[5:] dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp)) x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp))
x(os.MkdirAll("out", 0777)) x(os.MkdirAll("out", 0777))

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/LitNL
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/LitNL/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/LitNL/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out rm -fr out
mkdir out mkdir out
../../xml2txt $ds xml2txt $ds
rm -f $corpus.lines rm -f $corpus.lines
for i in out/*.txt for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -28,7 +28,7 @@ type Item struct {
var ( var (
x = e.ExitErr x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`)
) )
func main() { func main() {
@@ -36,17 +36,17 @@ func main() {
var ds string var ds string
switch len(os.Args) { switch len(os.Args) {
case 1: case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek() t := time.Now().AddDate(0, 0, -2)
ds = fmt.Sprintf("%d-%02d", year, week) ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day())
case 2: case 2:
if !reYearWeek.MatchString(os.Args[1]) { if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww")) x(fmt.Errorf("arg must be yyyy-mm-dd"))
} }
ds = os.Args[1] ds = os.Args[1]
default: default:
x(fmt.Errorf("too many arguments")) x(fmt.Errorf("too many arguments"))
} }
dp := ds[:4] + "/" + ds[5:] dp := strings.ReplaceAll(ds, "-", "/")
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp)) x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
x(os.MkdirAll("out", 0777)) x(os.MkdirAll("out", 0777))

View File

@@ -94,8 +94,7 @@ func main() {
} }
} }
p(err) p(err)
year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/NOS
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -2days +%Y-%m-%d`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -23,10 +26,11 @@ else
fi fi
dp=${ds//-//} dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/NOS/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/NOS/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +42,7 @@ fi
rm -fr out rm -fr out
mkdir out mkdir out
../../json2txt $ds json2txt $ds
rm -f $corpus.lines rm -f $corpus.lines
for i in out/*.txt for i in out/*.txt
@@ -53,15 +57,15 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -98,8 +98,7 @@ func main() {
t, err = time.Parse(time.RFC1123, item.PubDate) t, err = time.Parse(time.RFC1123, item.PubDate)
} }
p(err) p(err)
year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/NU
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -2days +%Y-%m-%d`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -23,10 +26,11 @@ else
fi fi
dp=${ds//-//} dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/NU/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/NU/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +55,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -2,8 +2,11 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/NieuwsNL
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
@@ -25,10 +28,11 @@ else
fi fi
dp=${ds//-//} dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/NieuwsNL/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -53,14 +57,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -77,7 +77,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -23,7 +23,7 @@ type Item struct {
var ( var (
x = e.ExitErr x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
) )
func main() { func main() {
@@ -32,16 +32,16 @@ func main() {
switch len(os.Args) { switch len(os.Args) {
case 1: case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek() year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week) ds = fmt.Sprintf("%d.%02d", year, week)
case 2: case 2:
if !reYearWeek.MatchString(os.Args[1]) { if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww")) x(fmt.Errorf("arg must be yyyy.ww"))
} }
ds = os.Args[1] ds = os.Args[1]
default: default:
x(fmt.Errorf("too many arguments")) x(fmt.Errorf("too many arguments"))
} }
dp := ds[:4] + "/" + ds[5:] dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/Oog/" + dp)) x(os.Chdir("/net/corpora/nlnieuws/Oog/" + dp))
x(os.MkdirAll("out", 0777)) x(os.MkdirAll("out", 0777))

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Oog
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Oog/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/Oog/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out rm -fr out
mkdir out mkdir out
../../xml2txt $ds xml2txt $ds
rm -f $corpus.lines rm -f $corpus.lines
for i in out/*.txt for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -102,7 +102,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Parool
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Parool/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/Parool/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -77,7 +77,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -26,7 +26,7 @@ var (
x = e.ExitErr x = e.ExitErr
w = e.WarnErr w = e.WarnErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
) )
func main() { func main() {
@@ -38,16 +38,16 @@ func main() {
switch len(os.Args) { switch len(os.Args) {
case 1: case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek() year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week) ds = fmt.Sprintf("%d.%02d", year, week)
case 2: case 2:
if !reYearWeek.MatchString(os.Args[1]) { if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww")) x(fmt.Errorf("arg must be yyyy.ww"))
} }
ds = os.Args[1] ds = os.Args[1]
default: default:
x(fmt.Errorf("too many arguments")) x(fmt.Errorf("too many arguments"))
} }
dp := ds[:4] + "/" + ds[5:] dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp)) x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp))
x(os.MkdirAll("out", 0777)) x(os.MkdirAll("out", 0777))

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/RO
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/RO/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/RO/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out rm -fr out
mkdir out mkdir out
../../xml2txt $ds xml2txt $ds
rm -f $corpus.lines rm -f $corpus.lines
for i in out/*.txt for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -101,7 +101,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/RTVNoord
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/RTVNoord/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/RTVNoord/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -51,15 +56,15 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -77,7 +77,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -23,7 +23,7 @@ type Item struct {
var ( var (
x = e.ExitErr x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
) )
func main() { func main() {
@@ -32,16 +32,16 @@ func main() {
switch len(os.Args) { switch len(os.Args) {
case 1: case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek() year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week) ds = fmt.Sprintf("%d.%02d", year, week)
case 2: case 2:
if !reYearWeek.MatchString(os.Args[1]) { if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww")) x(fmt.Errorf("arg must be yyyy.ww"))
} }
ds = os.Args[1] ds = os.Args[1]
default: default:
x(fmt.Errorf("too many arguments")) x(fmt.Errorf("too many arguments"))
} }
dp := ds[:4] + "/" + ds[5:] dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp)) x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp))
x(os.MkdirAll("out", 0777)) x(os.MkdirAll("out", 0777))

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Sargasso
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Sargasso/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/Sargasso/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out rm -fr out
mkdir out mkdir out
../../xml2txt $ds xml2txt $ds
rm -f $corpus.lines rm -f $corpus.lines
for i in out/*.txt for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -90,7 +90,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Sikkom
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Sikkom/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/Sikkom/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -51,7 +56,7 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index

View File

@@ -77,7 +77,7 @@ func main() {
} }
p(err) p(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/w%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -23,7 +23,7 @@ type Item struct {
var ( var (
x = e.ExitErr x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
) )
func main() { func main() {
@@ -32,16 +32,16 @@ func main() {
switch len(os.Args) { switch len(os.Args) {
case 1: case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek() year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week) ds = fmt.Sprintf("%d.%02d", year, week)
case 2: case 2:
if !reYearWeek.MatchString(os.Args[1]) { if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww")) x(fmt.Errorf("arg must be yyyy.ww"))
} }
ds = os.Args[1] ds = os.Args[1]
default: default:
x(fmt.Errorf("too many arguments")) x(fmt.Errorf("too many arguments"))
} }
dp := ds[:4] + "/" + ds[5:] dp := ds[:4] + "/w" + ds[5:]
x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp)) x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
x(os.MkdirAll("out", 0777)) x(os.MkdirAll("out", 0777))

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/Tzum
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -22,11 +25,13 @@ else
esac esac
fi fi
dp=${ds//-//} year=${ds%.*}
week=${ds#*.}
dp=$year/w$week
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/Tzum/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/Tzum/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
rm -fr out rm -fr out
mkdir out mkdir out
../../xml2txt $ds xml2txt $ds
rm -f $corpus.lines rm -f $corpus.lines
for i in out/*.txt for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -110,8 +110,7 @@ func main() {
if t2.After(t) { if t2.After(t) {
t = t2 t = t2
} }
year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week)
if exists(dirname + "/lock") { if exists(dirname + "/lock") {
continue continue
} }

View File

@@ -2,17 +2,20 @@
set -e set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/VRT
unset CDPATH unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null . /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -2days +%Y-%m-%d`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -23,10 +26,11 @@ else
fi fi
dp=${ds//-//} dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/VRT/corpus/$ds cd $PART/$dp
cd /net/corpora/nlnieuws/VRT/$dp
ln -s lock.$$ lock ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ] if [ "`readlink lock`" != lock.$$ ]
@@ -51,15 +55,15 @@ cd out
mkdir xml mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata metadata
cd xml cd xml
rm -f $corpus.data.dz $corpus.index rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin # telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -70,9 +70,9 @@ var (
func main() { func main() {
aa := strings.Split(os.Args[1], "-") aa := strings.Split(os.Args[1], ".")
if len(aa) != 2 { if len(aa) != 2 {
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn")) x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
} }
var err error var err error
@@ -128,7 +128,8 @@ func makeParts(source string) *Parts {
func makeValues(source, part string) [][5]any { func makeValues(source, part string) [][5]any {
v := make([][5]any, 0) v := make([][5]any, 0)
filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s", filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d-%02d-%d%s",
year,
sources[source], sources[source],
parts[part].file, parts[part].file,
year, year,
@@ -226,7 +227,7 @@ func dates() (start, first, last string, names []string) {
t3 := tStart t3 := tStart
for range size { for range size {
y, w := t3.ISOWeek() y, w := t3.ISOWeek()
names = append(names, fmt.Sprintf("%d/%02d", y, w)) names = append(names, fmt.Sprintf("%d/w%02d", y, w))
t3 = t3.AddDate(0, 0, 7) t3 = t3.AddDate(0, 0, 7)
} }
t3 = tStart t3 = tStart

View File

@@ -26,13 +26,24 @@ var (
func main() { func main() {
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json") dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
x(err) x(err)
for _, dir := range dirs {
if !dir.IsDir() {
continue
}
dirname := dir.Name()
if dirname[0] != '2' {
continue
}
files, err := os.ReadDir("/net/corpora/nlnieuws/data/json/" + dirname)
x(err)
for _, file := range files { for _, file := range files {
filename := file.Name() filename := file.Name()
if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") { if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
addWeek(filename[5:12]) addWeek(filename[5:12])
}
} }
} }

View File

@@ -15,9 +15,9 @@ var (
) )
func main() { func main() {
aa := strings.Split(os.Args[1], "-") aa := strings.Split(os.Args[1], ".")
if len(aa) != 2 { if len(aa) != 2 {
x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn")) x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
} }
year, err := strconv.Atoi(aa[0]) year, err := strconv.Atoi(aa[0])
@@ -58,7 +58,7 @@ func main() {
fmt.Print(" -or") fmt.Print(" -or")
} }
y, w := t2.ISOWeek() y, w := t2.ISOWeek()
fmt.Printf(" -name %d-%02d.data.dz", y, w) fmt.Printf(" -name %d.%02d.data.dz", y, w)
t2 = t2.AddDate(0, 0, 7) t2 = t2.AddDate(0, 0, 7)
} }

View File

@@ -22,10 +22,10 @@ say () {
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`date -d -7days +%G-%V` ds=`date -d -7days +%G.%V`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9].[0-5][0-9])
ds=$1 ds=$1
;; ;;
*) *)
@@ -35,7 +35,11 @@ else
esac esac
fi fi
cd /net/corpora/nlnieuws/data year=${ds%%.*}
mkdir -p /net/corpora/nlnieuws/data/$year
mkdir -p /net/corpora/nlnieuws/data/json/$year
cd /net/corpora/nlnieuws/data/$year
declare -A parts declare -A parts
#parts[alles]='.' #parts[alles]='.'
@@ -68,7 +72,7 @@ do
for i in 1 4 for i in 1 4
do do
files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
if [ -z "$files" ] if [ -z "$files" ]
then then
continue continue
@@ -161,10 +165,10 @@ do
done done
done done
data2json $ds 1 > json/DATA-$ds-1.json data2json $ds 1 > ../json/$year/DATA-$ds-1.json
data2json $ds 4 > json/DATA-$ds-4.json data2json $ds 4 > ../json/$year/DATA-$ds-4.json
dates2json > json/index1.json dates2json > ../json/index1.json
dates2json > json/index4.json dates2json > ../json/index4.json
# rechten bijwerken # rechten bijwerken
chmod -R g+w /net/corpora/nlnieuws chmod -R g+w /net/corpora/nlnieuws

View File

@@ -17,8 +17,11 @@ var (
func AddEnd(s string) string { func AddEnd(s string) string {
s = strings.TrimSpace(s) s = strings.TrimSpace(s)
if s == "" || reEOL.MatchString(s) { if s == "" {
return s return ""
}
if reEOL.MatchString(s) {
return s + "\n"
} }
return s + ".\n" return s + ".\n"
} }

View File

@@ -133,7 +133,9 @@ function makeTD(title, values) {
async function loadSource(source, week) { async function loadSource(source, week) {
if (!data.has(week)) { if (!data.has(week)) {
data[week] = await getJSON('DATA-' + week + '-4.json') data[week] = await getJSON(
week.substring(0, 4) + '/DATA-' + week + '-4.json',
)
} }
idSubtitle.innerHTML = source + ' — t/m ' + data[week].last idSubtitle.innerHTML = source + ' — t/m ' + data[week].last
@@ -154,7 +156,9 @@ async function loadSource(source, week) {
async function loadPart(part, week) { async function loadPart(part, week) {
if (!data.has(week)) { if (!data.has(week)) {
data[week] = await getJSON('DATA-' + week + '-4.json') data[week] = await getJSON(
week.substring(0, 4) + '/DATA-' + week + '-4.json',
)
} }
idSubtitle.innerHTML = part + ' — t/m ' + data[week].last idSubtitle.innerHTML = part + ' — t/m ' + data[week].last
@@ -185,7 +189,9 @@ async function loadWeken(source, part) {
if (i < dates.length) { if (i < dates.length) {
var week = dates[i].week var week = dates[i].week
if (!data.has(week)) { if (!data.has(week)) {
data[week] = await getJSON('DATA-' + week + '-4.json') data[week] = await getJSON(
week.substring(0, 4) + '/DATA-' + week + '-4.json',
)
} }
var values = data[week][source][part] var values = data[week][source][part]
tr.appendChild(makeTD('t/m ' + data[week].last, values)) tr.appendChild(makeTD('t/m ' + data[week].last, values))