grote reorganisatie:

- HLN, NOS, NU, VRT: per week -> per dag
- yyyy-ww -> yyyy.ww
- yyyy*  -> yyyy/yyyy*
etc
This commit is contained in:
Peter Kleiweg
2026-05-27 22:42:03 +02:00
parent e430ff576b
commit 5c651387af
46 changed files with 328 additions and 227 deletions

View File

@@ -28,7 +28,7 @@ type Item struct {
var (
x = e.ExitErr
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`)
)
func main() {
@@ -36,17 +36,17 @@ func main() {
var ds string
switch len(os.Args) {
case 1:
year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
ds = fmt.Sprintf("%d-%02d", year, week)
t := time.Now().AddDate(0, 0, -2)
ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day())
case 2:
if !reYearWeek.MatchString(os.Args[1]) {
x(fmt.Errorf("arg must be yyyy-ww"))
x(fmt.Errorf("arg must be yyyy-mm-dd"))
}
ds = os.Args[1]
default:
x(fmt.Errorf("too many arguments"))
}
dp := ds[:4] + "/" + ds[5:]
dp := strings.ReplaceAll(ds, "-", "/")
x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
x(os.MkdirAll("out", 0777))

View File

@@ -94,8 +94,7 @@ func main() {
}
}
p(err)
year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
if exists(dirname + "/lock") {
continue
}

View File

@@ -2,17 +2,20 @@
set -e
BASE=/net/corpora/nlnieuws
PART=$BASE/NOS
unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam
. /net/aps/etc/alpino-activate.sh > /dev/null
if [ "$1" = "" ]
then
ds=`date -d -7days +%G-%V`
ds=`date -d -2days +%Y-%m-%d`
else
case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9])
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
ds=$1
;;
*)
@@ -23,10 +26,11 @@ else
fi
dp=${ds//-//}
year=${ds%%-*}
corpus=$PART/corpus/$year/$ds
mkdir -p $PART/corpus/$year
corpus=/net/corpora/nlnieuws/NOS/corpus/$ds
cd /net/corpora/nlnieuws/NOS/$dp
cd $PART/$dp
ln -s lock.$$ lock
if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +42,7 @@ fi
rm -fr out
mkdir out
../../json2txt $ds
json2txt $ds
rm -f $corpus.lines
for i in out/*.txt
@@ -53,15 +57,15 @@ cd out
mkdir xml
Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
../../../metadata
metadata
cd xml
rm -f $corpus.data.dz $corpus.index
alto -q -o $corpus.data.dz *.xml
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
cd ../..
rm -fr out