stijgers2json.py

collect.sh: stijgers
weekadd
2026-06-19 17:53:03 +02:00 · 2026-06-19 13:41:28 +02:00 · 2026-06-19 12:59:45 +02:00 · 2026-06-18 14:49:00 +02:00 · 2026-06-18 12:52:40 +02:00 · 2026-06-10 13:31:55 +02:00
109 changed files with 26801 additions and 1409 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -38,15 +38,19 @@ Sikkom/sikkom
 Tzum/metadata
 Tzum/tzum
 Tzum/xml2txt
+Volkskrant/metadata
+Volkskrant/volkskrant
 VRT/metadata
 VRT/vrt
 bin/data2json
 bin/dates2json
 bin/flush
 bin/items2count
-bin/score
+bin/rang
 bin/top20
+bin/trends
 bin/week2files
+bin/weekadd
 20??
 corpus
 data
--- a/AT5/Makefile
+++ b/AT5/Makefile
@@ -3,11 +3,11 @@ all: \
 	metadata \
 	at5

-xml2txt: cmd/xml2txt/*.go
-	go build -o $@ $^
+xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
+	go build -o $@ $<

 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-at5: cmd/at5/*.go
-	go build -o $@ $^
+at5: cmd/at5/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/AT5/cmd/at5/at5.go
+++ b/AT5/cmd/at5/at5.go
@@ -3,13 +3,14 @@ package main
 import (
 	e "codeberg.org/pebbe/errors"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -46,7 +47,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/AT5/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -76,7 +77,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -110,16 +111,3 @@ func main() {
 		}()
 	}
 }
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/AT5/cmd/xml2txt/xml2txt.go
+++ b/AT5/cmd/xml2txt/xml2txt.go
@@ -4,6 +4,8 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"os"
@@ -20,7 +22,7 @@ type Item struct {
 var (
 	x = e.ExitErr

-	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
+	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
 )

 func main() {
@@ -29,16 +31,16 @@ func main() {
 	switch len(os.Args) {
 	case 1:
 		year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
-		ds = fmt.Sprintf("%d-%02d", year, week)
+		ds = fmt.Sprintf("%d.%02d", year, week)
 	case 2:
 		if !reYearWeek.MatchString(os.Args[1]) {
-			x(fmt.Errorf("arg must be yyyy-ww"))
+			x(fmt.Errorf("arg must be yyyy.ww"))
 		}
 		ds = os.Args[1]
 	default:
 		x(fmt.Errorf("too many arguments"))
 	}
-	dp := ds[:4] + "/" + ds[5:]
+	dp := ds[:4] + "/w" + ds[5:]

 	x(os.Chdir("/net/corpora/nlnieuws/AT5/" + dp))
 	x(os.MkdirAll("out", 0777))
@@ -55,39 +57,15 @@ func main() {
 		x(err)
 		var item Item
 		x(xml.Unmarshal(b, &item), filename)
-		x(fp.WriteString(addEnd(fixSpace(item.Title))))
-		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
+		x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
+		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
 		x(err)
 		root := doc.Root()
 		pp, err := root.Search(`//body/p | //body/h2`)
 		x(err)
 		for _, p := range pp {
-			x(fp.WriteString(addEnd(fixSpace(p.Content()))))
+			x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
 		}
 		x(fp.Close())
 	}
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
--- a/AT5/txt2corpus.sh
+++ b/AT5/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/AT5
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/AT5/corpus/$ds
-
-cd /net/corpora/nlnieuws/AT5/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
 rm -fr out
 mkdir out

-../../xml2txt $ds
+xml2txt $ds

 rm -f $corpus.lines
 for i in out/*.txt
@@ -53,7 +58,7 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
--- a/BuurtAdam/Makefile
+++ b/BuurtAdam/Makefile
@@ -5,5 +5,5 @@ all: \
 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-buurtadam: cmd/buurtadam/*.go
-	go build -o $@ $^
+buurtadam: cmd/buurtadam/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/BuurtAdam/cmd/buurtadam/buurtadam.go
+++ b/BuurtAdam/cmd/buurtadam/buurtadam.go
@@ -4,13 +4,14 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -64,7 +65,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/BuurtAdam/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -94,7 +95,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtAdam/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -158,6 +159,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 	p(err)
 	p(resp.Body.Close())

+	body = u.HtmlFix(body)
+
 	doc, err := gokogiri.ParseHtml(body)
 	p(err)

@@ -202,7 +205,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 	}

 	for _, div := range divs {
-		p(fp.WriteString(addEnd(fixSpace(div.Content()))))
+		p(fp.WriteString(u.AddEnd(u.FixSpace(div.Content()))))
 	}

 	p(fp.Close())
@@ -211,40 +214,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n

 	return true
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/BuurtAdam/txt2corpus.sh
+++ b/BuurtAdam/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/BuurtAdam
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/BuurtAdam/corpus/$ds
-
-cd /net/corpora/nlnieuws/BuurtAdam/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/BuurtGrn/Makefile
+++ b/BuurtGrn/Makefile
@@ -5,5 +5,5 @@ all: \
 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-buurtgrn: cmd/buurtgrn/*.go
-	go build -o $@ $^
+buurtgrn: cmd/buurtgrn/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/BuurtGrn/cmd/buurtgrn/buurtgrn.go
+++ b/BuurtGrn/cmd/buurtgrn/buurtgrn.go
@@ -6,11 +6,11 @@ import (

 	"encoding/xml"
 	"fmt"
+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -64,7 +64,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/BuurtGrn/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -94,7 +94,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/BuurtGrn/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -158,6 +158,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 	p(err)
 	p(resp.Body.Close())

+	body = u.HtmlFix(body)
+
 	doc, err := gokogiri.ParseHtml(body)
 	p(err)

@@ -202,7 +204,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 	}

 	for _, div := range divs {
-		p(fp.WriteString(addEnd(fixSpace(div.Content()))))
+		p(fp.WriteString(u.AddEnd(u.FixSpace(div.Content()))))
 	}

 	p(fp.Close())
@@ -211,40 +213,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n

 	return true
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/BuurtGrn/txt2corpus.sh
+++ b/BuurtGrn/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/BuurtGrn
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/BuurtGrn/corpus/$ds
-
-cd /net/corpora/nlnieuws/BuurtGrn/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/GG/Makefile
+++ b/GG/Makefile
@@ -5,5 +5,5 @@ all: \
 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-gg: cmd/gg/*.go
-	go build -o $@ $^
+gg: cmd/gg/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/GG/cmd/gg/gg.go
+++ b/GG/cmd/gg/gg.go
@@ -4,13 +4,14 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -64,7 +65,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/GG/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -94,7 +95,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -154,6 +155,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 	p(err)
 	p(resp.Body.Close())

+	body = u.HtmlFix(body)
+
 	doc, err := gokogiri.ParseHtml(body)
 	p(err)

@@ -191,10 +194,10 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 	fp, err := os.Create(filename + ".txt")
 	p(err)

-	p(fp.WriteString(addEnd(fixSpace(title))))
+	p(fp.WriteString(u.AddEnd(u.FixSpace(title))))

 	for _, el := range ell {
-		p(fp.WriteString(addEnd(fixSpace(el.Content()))))
+		p(fp.WriteString(u.AddEnd(u.FixSpace(el.Content()))))
 	}

 	p(fp.Close())
@@ -203,40 +206,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n

 	return true
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/GG/txt2corpus.sh
+++ b/GG/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/GG
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/GG/corpus/$ds
-
-cd /net/corpora/nlnieuws/GG/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -51,7 +56,7 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
--- a/HLN/Makefile
+++ b/HLN/Makefile
@@ -5,5 +5,5 @@ all: \
 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-hln: cmd/hln/*.go
-	go build -o $@ $^
+hln: cmd/hln/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/HLN/cmd/hln/hln.go
+++ b/HLN/cmd/hln/hln.go
@@ -4,13 +4,16 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
+	// "encoding/json"
 	"encoding/xml"
 	"fmt"
+	// "html"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -25,9 +28,16 @@ type ItemT struct {
 	UnixTime int64  `xml:"unixTime"`
 	Guid     string `xml:"guid"`
 	Link     string `xml:"link"`
+	Title    string `xml:"title"`
 	Data     []byte `xml:",innerxml"`
 }

+/*
+type GraphT struct {
+	Graph []map[string]any `json:"@graph"`
+}
+*/
+
 var (
 	p     = e.PanicErr
 	w     = e.WarnErr
@@ -62,7 +72,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/HLN/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -91,8 +101,7 @@ func main() {
 			t, err = time.Parse(time.RFC1123, item.PubDate)
 		}
 		p(err)
-		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -127,18 +136,19 @@ func main() {
 			p(fp.WriteString("</item>\n"))
 			p(fp.Close())
 			p(os.Chtimes(filename+".xml", t, t))
-			ok = doArticle(filename, item.Link, t, needUpdate)
+			ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
 		}()
 	}
 }

-func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) {
+func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
 	if exists(filename + ".skip") {
 		return true
 	}
 	if needUpdate {
 		_ = os.Remove(filename + ".err")
 		_ = os.Remove(filename + ".html")
+		// _ = os.Remove(filename + ".json")
 		_ = os.Remove(filename + ".txt")
 	} else {
 		if exists(filename + ".txt") {
@@ -158,6 +168,62 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 	p(err)
 	p(resp.Body.Close())

+	body = u.HtmlFix(body)
+
+	/*
+		s := string(body)
+		ok = true
+		i1 := strings.Index(s, `type="application/ld+json"`)
+		if i1 < 0 {
+			ok = false
+		} else {
+			i1 += strings.Index(s[i1:], `>`) + 1
+			i2 := i1 + strings.Index(s[i1:], `</script>`)
+			if i2 < i1 {
+				ok = false
+			} else {
+				s = html.UnescapeString(s[i1:i2])
+			}
+		}
+		if !ok {
+			_ = w(fmt.Errorf("script jsonld not found: %s", url))
+
+			fp, err := os.Create(filename + ".err")
+			p(err)
+			p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
+			p(fp.Close())
+			p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+			fp, err = os.Create(filename + ".html")
+			p(err)
+			p(fp.Write(body))
+			p(fp.Close())
+			p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+			return false
+		}
+
+		var graph GraphT
+		p(json.Unmarshal([]byte(s), &graph))
+		for _, g := range graph.Graph {
+			t := g["@type"]
+			switch v := t.(type) {
+			case string:
+				if v == "NewsArticle" {
+					b, err := json.Marshal(g)
+					p(err)
+					s = string(b)
+				}
+			}
+		}
+
+		fp, err := os.Create(filename + ".json")
+		p(err)
+		p(fp.WriteString(s))
+		p(fp.Close())
+		p(os.Chtimes(filename+".json", timestamp, timestamp))
+	*/
+
 	doc, err := gokogiri.ParseHtml(body)
 	p(err)

@@ -196,18 +262,6 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool

 	pars := make([]string, 0)

-	ell, err = article.Search(`.//*[@data-content-type="TITLE"]`)
-	p(err)
-	if len(ell) != 1 {
-		_ = w(fmt.Errorf("found %d titles: %s", len(ell), url))
-	}
-	for _, el := range ell {
-		s := strings.TrimSpace(el.Content())
-		if s != "" {
-			pars = append(pars, s)
-		}
-	}
-
 	hasIntro := false
 	ell, err = article.Search(`.//*[@data-content-type="INTRO"]`)
 	p(err)
@@ -224,8 +278,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool

 	specials, err := article.Search(`.//*[@data-content-type="GROUP"]`)
 	p(err)
-	for _, special := range specials {
-		special.Remove()
+	for i := len(specials) - 1; i >= 0; i-- {
+		specials[i].Remove()
 	}

 	other, err := article.Search(`.//*[@data-content-type="PODCAST"]`)
@@ -285,12 +339,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 		p(fmt.Fprintln(fp, "##META text tag ="))
 	} else {
 		for _, tag := range tags {
-			p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
+			p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
 		}
 	}

+	p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
+
 	for _, par := range pars {
-		p(fp.WriteString(addEnd(fixSpace(par))))
+		p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
 	}

 	p(fp.Close())
@@ -299,43 +355,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool

 	return true
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	if strings.HasSuffix(s, `.”`) || strings.HasSuffix(s, `!”`) || strings.HasSuffix(s, `?”`) {
-		return s + "\n"
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/HLN/txt2corpus.sh
+++ b/HLN/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/HLN
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -2days +%Y-%m-%d`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
            ds=$1
            ;;
        *)
@@ -23,10 +26,11 @@ else
 fi

 dp=${ds//-//}
+year=${ds%%-*}
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/HLN/corpus/$ds
-
-cd /net/corpora/nlnieuws/HLN/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +55,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/LitNL/Makefile
+++ b/LitNL/Makefile
@@ -3,11 +3,11 @@ all: \
 	metadata \
 	litnl

-xml2txt: cmd/xml2txt/*.go
-	go build -o $@ $^
+xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
+	go build -o $@ $<

 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-litnl: cmd/litnl/*.go
-	go build -o $@ $^
+litnl: cmd/litnl/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/LitNL/cmd/litnl/litnl.go
+++ b/LitNL/cmd/litnl/litnl.go
@@ -3,13 +3,14 @@ package main
 import (
 	e "codeberg.org/pebbe/errors"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -46,7 +47,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/LitNL/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -76,7 +77,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -108,16 +109,3 @@ func main() {
 	}

 }
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/LitNL/cmd/xml2txt/xml2txt.go
+++ b/LitNL/cmd/xml2txt/xml2txt.go
@@ -4,6 +4,8 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"os"
@@ -22,7 +24,7 @@ var (
 	w = e.WarnErr
 	x = e.ExitErr

-	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
+	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
 )

 func main() {
@@ -31,16 +33,16 @@ func main() {
 	switch len(os.Args) {
 	case 1:
 		year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
-		ds = fmt.Sprintf("%d-%02d", year, week)
+		ds = fmt.Sprintf("%d.%02d", year, week)
 	case 2:
 		if !reYearWeek.MatchString(os.Args[1]) {
-			x(fmt.Errorf("arg must be yyyy-ww"))
+			x(fmt.Errorf("arg must be yyyy.ww"))
 		}
 		ds = os.Args[1]
 	default:
 		x(fmt.Errorf("too many arguments"))
 	}
-	dp := ds[:4] + "/" + ds[5:]
+	dp := ds[:4] + "/w" + ds[5:]

 	x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp))
 	x(os.MkdirAll("out", 0777))
@@ -58,10 +60,10 @@ func main() {
 		var item Item
 		x(xml.Unmarshal(b, &item))
 		for _, cat := range item.Cats {
-			x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
+			x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
 		}
-		x(fp.WriteString(addEnd(fixSpace(item.Title))))
-		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
+		x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
+		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
 		x(err)
 		root := doc.Root()
 		pp, err := root.Search(`//body//p`)
@@ -74,32 +76,8 @@ func main() {
 			_ = w(fmt.Errorf("empty: %s", filename))
 		}
 		for _, p := range pp {
-			x(fp.WriteString(addEnd(fixSpace(p.Content()))))
+			x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
 		}
 		x(fp.Close())
 	}
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
--- a/LitNL/txt2corpus.sh
+++ b/LitNL/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/LitNL
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/LitNL/corpus/$ds
-
-cd /net/corpora/nlnieuws/LitNL/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
 rm -fr out
 mkdir out

-../../xml2txt $ds
+xml2txt $ds

 rm -f $corpus.lines
 for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/Makefile
+++ b/Makefile
@@ -16,14 +16,17 @@ all:
 	make -C Sargasso
 	make -C Sikkom
 	make -C Tzum
+	make -C Volkskrant
 	make -C VRT
 	make bin/data2json
 	make bin/dates2json
 	make bin/flush
 	make bin/items2count
-	make bin/score
+	make bin/rang
 	make bin/top20
+	make bin/trends
 	make bin/week2files
+	make bin/weekadd

 bin/data2json: cmd/data2json/*.go
 	go build -o $@ $^
@@ -37,12 +40,18 @@ bin/flush: cmd/flush/*.go
 bin/items2count: cmd/items2count/*.go
 	go build -o $@ $^

-bin/score: cmd/score/*.go
+bin/rang: cmd/rang/*.go
 	go build -o $@ $^

 bin/top20: cmd/top20/*.go
 	go build -o $@ $^

+bin/trends: cmd/trends/*.go
+	go build -o $@ $^
+
 bin/week2files: cmd/week2files/*.go
 	go build -o $@ $^

+bin/weekadd: cmd/weekadd/*.go
+	go build -o $@ $^
+
--- a/NOS/Makefile
+++ b/NOS/Makefile
@@ -3,11 +3,11 @@ all: \
 	metadata \
 	nos

-json2txt: cmd/json2txt/*.go
-	go build -o $@ $^
+json2txt: cmd/json2txt/*.go ../internal/util/*.go
+	go build -o $@ $<

 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-nos: cmd/nos/*.go
-	go build -o $@ $^
+nos: cmd/nos/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/NOS/cmd/json2txt/json2txt.go
+++ b/NOS/cmd/json2txt/json2txt.go
@@ -3,6 +3,8 @@ package main
 import (
 	e "codeberg.org/pebbe/errors"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/json"
 	"fmt"
 	"os"
@@ -26,7 +28,7 @@ type Item struct {
 var (
 	x = e.ExitErr

-	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
+	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]$`)
 )

 func main() {
@@ -34,17 +36,17 @@ func main() {
 	var ds string
 	switch len(os.Args) {
 	case 1:
-		year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
-		ds = fmt.Sprintf("%d-%02d", year, week)
+		t := time.Now().AddDate(0, 0, -2)
+		ds = fmt.Sprintf("%d-%02d-%02d", t.Year(), int(t.Month()), t.Day())
 	case 2:
 		if !reYearWeek.MatchString(os.Args[1]) {
-			x(fmt.Errorf("arg must be yyyy-ww"))
+			x(fmt.Errorf("arg must be yyyy-mm-dd"))
 		}
 		ds = os.Args[1]
 	default:
 		x(fmt.Errorf("too many arguments"))
 	}
-	dp := ds[:4] + "/" + ds[5:]
+	dp := strings.ReplaceAll(ds, "-", "/")

 	x(os.Chdir("/net/corpora/nlnieuws/NOS/" + dp))
 	x(os.MkdirAll("out", 0777))
@@ -61,13 +63,15 @@ func main() {
 		x(err)
 		item := getItem(b, filename)
 		for _, cat := range item.Cats {
-			x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat)))
+			x(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(cat)))
 		}
 		for _, tag := range item.Tags {
-			x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
+			x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
+		}
+		x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
+		for _, line := range strings.SplitAfter(item.Text, "\n") {
+			x(fp.WriteString(u.AddEnd(u.FixSpace(line, true))))
 		}
-		x(fp.WriteString(addEnd(fixSpace(item.Title))))
-		x(fp.WriteString(fixSpace(item.Text)))
 		x(fp.Close())
 	}
 }
@@ -90,27 +94,3 @@ func getItem(b []byte, filename string) Item {
 	x(json.Unmarshal(b, &item), filename)
 	return item
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
--- a/NOS/cmd/nos/nos.go
+++ b/NOS/cmd/nos/nos.go
@@ -1,17 +1,17 @@
 package main

 import (
-	"html"
-
 	e "codeberg.org/pebbe/errors"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
+	"html"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -63,7 +63,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/NOS/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -94,8 +94,7 @@ func main() {
 			}
 		}
 		p(err)
-		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -195,16 +194,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 	p(os.Chtimes(filename+".json", timestamp, timestamp))
 	return true
 }
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/NOS/txt2corpus.sh
+++ b/NOS/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/NOS
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -2days +%Y-%m-%d`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
            ds=$1
            ;;
        *)
@@ -23,10 +26,11 @@ else
 fi

 dp=${ds//-//}
+year=${ds%%-*}
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/NOS/corpus/$ds
-
-cd /net/corpora/nlnieuws/NOS/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +42,7 @@ fi
 rm -fr out
 mkdir out

-../../json2txt $ds
+json2txt $ds

 rm -f $corpus.lines
 for i in out/*.txt
@@ -53,15 +57,15 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/NU/Makefile
+++ b/NU/Makefile
@@ -5,5 +5,5 @@ all: \
 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-nu: cmd/nu/*.go
-	go build -o $@ $^
+nu: cmd/nu/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/NU/cmd/nu/nu.go
+++ b/NU/cmd/nu/nu.go
@@ -3,6 +3,8 @@ package main
 import (
 	e "codeberg.org/pebbe/errors"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"bytes"
 	"encoding/json"
 	"encoding/xml"
@@ -12,7 +14,6 @@ import (
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -73,7 +74,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/NU/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -97,8 +98,7 @@ func main() {
 			t, err = time.Parse(time.RFC1123, item.PubDate)
 		}
 		p(err)
-		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -161,6 +161,8 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 	p(err)
 	p(resp.Body.Close())

+	body = u.HtmlFix(body)
+
 	s := string(body)
 	ok := true
 	i1 := strings.Index(s, `<script type="application/ld+json"`)
@@ -226,22 +228,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 		// text bevat kopjes zonder punt aan het eind
 		lines := strings.Split(text, "\n")
 		for i, line := range lines {
-			line = fixSpace(line)
-			n := len(line)
-			if n > 0 {
-				if strings.ContainsAny(line[n-1:], ".!?") {
-					continue
-				}
-			}
-			if n > 1 {
-				s := line[n-2:]
-				if s == `."` || s == `!"` || s == `?"` {
-					continue
-				}
-			}
-			lines[i] = line + "."
+			lines[i] = u.AddEnd(u.FixSpace(line, true))
 		}
-		text = strings.Join(lines, "\n") + "\n"
+		text = strings.Join(lines, "") + "\n"

 		fp, err := os.Create(filename + ".txt")
 		p(err)
@@ -249,7 +238,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 			p(fmt.Fprintln(fp, "##META text tag ="))
 		} else {
 			for _, tag := range tags {
-				p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
+				p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
 			}
 		}
 		p(fp.WriteString(text))
@@ -259,20 +248,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool

 	return true
 }
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/NU/txt2corpus.sh
+++ b/NU/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/NU
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -2days +%Y-%m-%d`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
            ds=$1
            ;;
        *)
@@ -23,10 +26,11 @@ else
 fi

 dp=${ds//-//}
+year=${ds%%-*}
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/NU/corpus/$ds
-
-cd /net/corpora/nlnieuws/NU/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +55,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/NieuwsNL/Makefile
+++ b/NieuwsNL/Makefile
@@ -5,5 +5,5 @@ all: \
 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-nieuwsnl: cmd/nieuwsnl/*.go
-	go build -o $@ $^
+nieuwsnl: cmd/nieuwsnl/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go
+++ b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go
@@ -4,6 +4,8 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"bytes"
 	"encoding/xml"
 	"fmt"
@@ -11,7 +13,6 @@ import (
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -64,7 +65,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/NieuwsNL/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -153,6 +154,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 	p(err)
 	p(resp.Body.Close())

+	body = u.HtmlFix(body)
+
 	doc, err := gokogiri.ParseHtml(body)
 	p(err)

@@ -173,11 +176,11 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 	} else {
 		for _, a := range aa {
 			tag = strings.ReplaceAll(a.Content(), "\n", " ")
-			p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
+			p(fmt.Fprintf(&buf, "##META text tag = %s\n", u.FixSpace(tag)))
 		}
 	}

-	p(buf.WriteString(addEnd(fixSpace(title))))
+	p(buf.WriteString(u.AddEnd(u.FixSpace(title))))

 	// oud: //div[@id="article-blocks"]//p
 	pp, err := root.Search(`//div[@id="article-blocks"]//div[contains(@class, "paragraph-content")]`)
@@ -204,7 +207,7 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 		return false // echt fout
 	}
 	for _, p1 := range pp {
-		p(buf.WriteString(addEnd(fixSpace(p1.Content()))))
+		p(buf.WriteString(u.AddEnd(u.FixSpace(p1.Content()))))
 	}

 	fp, err := os.Create(filename + ".txt")
@@ -216,40 +219,3 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n

 	return true
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/NieuwsNL/txt2corpus.sh
+++ b/NieuwsNL/txt2corpus.sh
@@ -2,8 +2,11 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/NieuwsNL
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

@@ -25,10 +28,11 @@ else
 fi

 dp=${ds//-//}
+year=${ds%%-*}
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/NieuwsNL/corpus/$ds
-
-cd /net/corpora/nlnieuws/NieuwsNL/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -53,14 +57,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/Oog/Makefile
+++ b/Oog/Makefile
@@ -3,11 +3,11 @@ all: \
 	metadata \
 	oog

-xml2txt: cmd/xml2txt/*.go
-	go build -o $@ $^
+xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
+	go build -o $@ $<

 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-oog: cmd/oog/*.go
-	go build -o $@ $^
+oog: cmd/oog/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/Oog/cmd/oog/oog.go
+++ b/Oog/cmd/oog/oog.go
@@ -3,13 +3,14 @@ package main
 import (
 	e "codeberg.org/pebbe/errors"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -46,7 +47,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/Oog/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -76,7 +77,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Oog/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -111,16 +112,3 @@ func main() {
 	}

 }
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/Oog/cmd/xml2txt/xml2txt.go
+++ b/Oog/cmd/xml2txt/xml2txt.go
@@ -4,6 +4,8 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"os"
@@ -21,7 +23,7 @@ type Item struct {
 var (
 	x = e.ExitErr

-	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
+	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
 )

 func main() {
@@ -30,16 +32,16 @@ func main() {
 	switch len(os.Args) {
 	case 1:
 		year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
-		ds = fmt.Sprintf("%d-%02d", year, week)
+		ds = fmt.Sprintf("%d.%02d", year, week)
 	case 2:
 		if !reYearWeek.MatchString(os.Args[1]) {
-			x(fmt.Errorf("arg must be yyyy-ww"))
+			x(fmt.Errorf("arg must be yyyy.ww"))
 		}
 		ds = os.Args[1]
 	default:
 		x(fmt.Errorf("too many arguments"))
 	}
-	dp := ds[:4] + "/" + ds[5:]
+	dp := ds[:4] + "/w" + ds[5:]

 	x(os.Chdir("/net/corpora/nlnieuws/Oog/" + dp))
 	x(os.MkdirAll("out", 0777))
@@ -57,41 +59,20 @@ func main() {
 		var item Item
 		x(xml.Unmarshal(b, &item))
 		for _, cat := range item.Cats {
-			x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
+			t := u.FixSpace(cat)
+			if t != "Nieuws" {
+				x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
+			}
 		}
-		x(fp.WriteString(addEnd(fixSpace(item.Title))))
-		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
+		x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
+		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
 		x(err)
 		root := doc.Root()
 		pp, err := root.Search(`//body/p`)
 		x(err)
 		for _, p := range pp {
-			x(fp.WriteString(addEnd(fixSpace(p.Content()))))
+			x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
 		}
 		x(fp.Close())
 	}
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
--- a/Oog/txt2corpus.sh
+++ b/Oog/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/Oog
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/Oog/corpus/$ds
-
-cd /net/corpora/nlnieuws/Oog/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
 rm -fr out
 mkdir out

-../../xml2txt $ds
+xml2txt $ds

 rm -f $corpus.lines
 for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/Parool/Makefile
+++ b/Parool/Makefile
@@ -5,5 +5,5 @@ all: \
 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-parool: cmd/parool/*.go
-	go build -o $@ $^
+parool: cmd/parool/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/Parool/cmd/parool/parool.go
+++ b/Parool/cmd/parool/parool.go
@@ -4,13 +4,16 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
+	//"encoding/json"
 	"encoding/xml"
 	"fmt"
+	//"html"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -25,9 +28,16 @@ type ItemT struct {
 	UnixTime int64  `xml:"unixTime"`
 	Guid     string `xml:"guid"`
 	Link     string `xml:"link"`
+	Title    string `xml:"title"`
 	Data     []byte `xml:",innerxml"`
 }

+/*
+type GraphT struct {
+	Graph []map[string]any `json:"@graph"`
+}
+*/
+
 var (
 	p     = e.PanicErr
 	w     = e.WarnErr
@@ -62,7 +72,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/Parool/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -92,7 +102,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -122,18 +132,19 @@ func main() {
 			p(fp.WriteString("</item>\n"))
 			p(fp.Close())
 			p(os.Chtimes(filename+".xml", t, t))
-			ok = doArticle(filename, item.Link, t, needUpdate)
+			ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
 		}()
 	}
 }

-func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) {
+func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
 	if exists(filename + ".skip") {
 		return true
 	}
 	if needUpdate {
 		_ = os.Remove(filename + ".err")
 		_ = os.Remove(filename + ".html")
+		// _ = os.Remove(filename + ".json")
 		_ = os.Remove(filename + ".txt")
 	} else {
 		if exists(filename + ".txt") {
@@ -153,9 +164,67 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 	p(err)
 	p(resp.Body.Close())

+	body = u.HtmlFix(body)
+
 	doc, err := gokogiri.ParseHtml(body)
 	p(err)

+	/*
+
+		s := string(body)
+
+		ok = true
+		i1 := strings.Index(s, `<script type="application/ld+json"`)
+		if i1 < 0 {
+			ok = false
+		} else {
+			i1 += strings.Index(s[i1:], `>`) + 1
+			i2 := i1 + strings.Index(s[i1:], `</script>`)
+			if i2 < i1 {
+				ok = false
+			} else {
+				s = html.UnescapeString(s[i1:i2])
+			}
+		}
+		if !ok {
+			_ = w(fmt.Errorf("script jsonld not found: %s", url))
+
+			fp, err := os.Create(filename + ".err")
+			p(err)
+			p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
+			p(fp.Close())
+			p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+			fp, err = os.Create(filename + ".html")
+			p(err)
+			p(fp.Write(body))
+			p(fp.Close())
+			p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+			return false
+		}
+
+		var graph GraphT
+		p(json.Unmarshal([]byte(s), &graph))
+		for _, g := range graph.Graph {
+			t := g["@type"]
+			switch v := t.(type) {
+			case string:
+				if v == "NewsArticle" {
+					b, err := json.Marshal(g)
+					p(err)
+					s = string(b)
+				}
+			}
+		}
+
+		fp, err := os.Create(filename + ".json")
+		p(err)
+		p(fp.WriteString(s))
+		p(fp.Close())
+		p(os.Chtimes(filename+".json", timestamp, timestamp))
+	*/
+
 	root := doc.Root()

 	articles, err := root.Search(`//article[@id="article-content"]`)
@@ -211,6 +280,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 	}
 	header := headers[0]

+	isVideo := false
 	tags := make([]string, 0)
 	ell, err := header.Search(`.//*[@data-test-id="article-label"]`)
 	p(err)
@@ -219,25 +289,16 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 	}
 	for _, el := range ell {
 		s := strings.TrimSpace(el.Content())
-		if s != "" {
+		if s != "" && s != "Nieuws" {
 			tags = append(tags, s)
 		}
+		if strings.ToLower(s) == "video" {
+			isVideo = true
+		}
 	}

 	pars := make([]string, 0)

-	ell, err = header.Search(`.//*[@data-test-id="article-title"]`)
-	p(err)
-	if len(ell) != 1 {
-		_ = w(fmt.Errorf("found %d titles: %s", len(ell), url))
-	}
-	for _, el := range ell {
-		s := strings.TrimSpace(el.Content())
-		if s != "" {
-			pars = append(pars, s)
-		}
-	}
-
 	found := false
 	ell, err = header.Search(`.//*[@data-test-id="header-intro"]`)
 	p(err)
@@ -252,10 +313,10 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 		_ = w(fmt.Errorf("no intro: %s", url))
 	}

-	specials, err := article.Search(`.//section//aside | .//section//figure | .//section//b`)
+	specials, err := article.Search(`.//aside | .//figure | .//figcaption | .//section//b`)
 	p(err)
-	for _, special := range specials {
-		special.Remove()
+	for i := len(specials) - 1; i >= 0; i-- {
+		specials[i].Remove()
 	}

 	ell, err = article.Search(`.//section//*[@data-article-element-index]`)
@@ -287,7 +348,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 		}
 	}
 	if !found {
-		_ = w(fmt.Errorf("no text, skipping: %s", url))
+		if !isVideo {
+			_ = w(fmt.Errorf("no text, skipping: %s", url))
+		}
 		fp, err := os.Create(filename + ".skip")
 		p(fp.WriteString(url + "\n"))
 		p(err)
@@ -309,12 +372,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 		p(fmt.Fprintln(fp, "##META text tag ="))
 	} else {
 		for _, tag := range tags {
-			p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
+			p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
 		}
 	}

+	p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
+
 	for _, par := range pars {
-		p(fp.WriteString(addEnd(fixSpace(par))))
+		p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
 	}

 	p(fp.Close())
@@ -323,43 +388,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool

 	return true
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	if strings.HasSuffix(s, `.”`) || strings.HasSuffix(s, `!”`) || strings.HasSuffix(s, `?”`) {
-		return s + "\n"
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/Parool/txt2corpus.sh
+++ b/Parool/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/Parool
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/Parool/corpus/$ds
-
-cd /net/corpora/nlnieuws/Parool/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -51,14 +56,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/README.md
+++ b/README.md
@@ -0,0 +1,95 @@
+# Actuele nieuwsberichten
+
+- voor Alpino: nieuwe namen en nieuwe woorden
+- voor *Woord van de maand* ook: toplijsten van personen, plaatsen,
+  organisaties en andere namen (TODO: url van webapp)
+
+Voor interactief gebruik:
+
+```
+query.sh
+```
+
+## 1. Verzamelen van berichten
+
+Berichten van o.a. NieuwsNL, NOS en NU in `[A-Z]*/yyyy/mm/dd/`
+
+Overigen in `[A-Z]*/yyyy/wNN/` (NN = weeknummer, voorafgegaan door de letter `w`)
+
+crontab van p209327@colossus
+
+```
+# m h dom mon dow command
+3   * *   *   *   /net/corpora/nlnieuws/AT5/at5
+4   * *   *   *   /net/corpora/nlnieuws/BuurtAdam/buurtadam
+5   * *   *   *   /net/corpora/nlnieuws/BuurtGrn/buurtgrn
+6   * *   *   *   /net/corpora/nlnieuws/GG/gg
+7   * *   *   *   /net/corpora/nlnieuws/HLN/hln
+8   * *   *   *   /net/corpora/nlnieuws/LitNL/litnl
+9   * *   *   *   /net/corpora/nlnieuws/NieuwsNL/nieuwsnl
+10  * *   *   *   /net/corpora/nlnieuws/NOS/nos
+11  * *   *   *   /net/corpora/nlnieuws/NU/nu
+12  * *   *   *   /net/corpora/nlnieuws/Oog/oog
+13  * *   *   *   /net/corpora/nlnieuws/Parool/parool
+14  * *   *   *   /net/corpora/nlnieuws/RO/ro
+15  * *   *   *   /net/corpora/nlnieuws/RTVNoord/rtvnoord
+16  * *   *   *   /net/corpora/nlnieuws/Sargasso/sargasso
+17  * *   *   *   /net/corpora/nlnieuws/Sikkom/sikkom
+18  * *   *   *   /net/corpora/nlnieuws/Tzum/tzum
+19  * *   *   *   /net/corpora/nlnieuws/VRT/vrt
+20  * *   *   *   /net/corpora/nlnieuws/Volkskrant/volkskrant
+```
+
+## 2. Teksten verwerken: omzetten naar zinnen, parsen, metadata toevoegen
+
+Uitvoer in `[A-Z]*/corpus/`
+
+crontab van p209327@colossus
+
+```
+# m h dom mon dow command
+# veel data: elke dag
+0   1 *   *   *   /net/corpora/nlnieuws/HLN/txt2corpus.sh
+0   1 *   *   *   /net/corpora/nlnieuws/NOS/txt2corpus.sh
+0   1 *   *   *   /net/corpora/nlnieuws/NU/txt2corpus.sh
+0   1 *   *   *   /net/corpora/nlnieuws/NieuwsNL/txt2corpus.sh
+0   1 *   *   *   /net/corpora/nlnieuws/VRT/txt2corpus.sh
+0   1 *   *   *   /net/corpora/nlnieuws/Volkskrant/txt2corpus.sh
+# weinig data: alleen op dinsdag
+0   1 *   *   2   /net/corpora/nlnieuws/AT5/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/BuurtAdam/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/BuurtGrn/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/GG/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/LitNL/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/Oog/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/Parool/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/RO/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/RTVNoord/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/Sargasso/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/Sikkom/txt2corpus.sh
+0   1 *   *   2   /net/corpora/nlnieuws/Tzum/txt2corpus.sh
+```
+
+## 3. Queries uitvoeren, tellingen doen
+
+Tellingen in `data/`
+
+Gegevens voor webapp in `data/json/`
+
+Op woensdag
+
+crontab van p209327@colossus
+
+```
+# m h dom mon dow command
+0   1 *   *   3   /net/corpora/nlnieuws/collect.sh
+```
+
+## 4. Data in json op webplatform zetten
+
+crontab van f109308@colossus
+
+```
+# m h      dom mon dow command
+30  0-23/4 *   *   *   rsync -e 'ssh -F /net/aistaff/alfa/.ssh/config' -a --no-g /net/corpora/nlnieuws/data/json/ webalfa:/home/www/f109308/site/wvdm/data
+```
--- a/RO/Makefile
+++ b/RO/Makefile
@@ -3,11 +3,11 @@ all: \
 	metadata \
 	ro

-xml2txt: cmd/xml2txt/*.go
-	go build -o $@ $^
+xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
+	go build -o $@ $<

 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-ro: cmd/ro/*.go
-	go build -o $@ $^
+ro: cmd/ro/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/RO/cmd/ro/ro.go
+++ b/RO/cmd/ro/ro.go
@@ -3,13 +3,14 @@ package main
 import (
 	e "codeberg.org/pebbe/errors"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -28,8 +29,9 @@ type ItemT struct {
 }

 var (
-	p     = e.PanicErr
-	agent = "AhrefsBot/7.0"
+	p = e.PanicErr
+	//agent = "AhrefsBot/7.0"
+	agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/149.0.0.0 Safari/537.36"
 )

 func exists(filename string) bool {
@@ -46,7 +48,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/RO/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -76,7 +78,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -111,16 +113,3 @@ func main() {
 	}

 }
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/RO/cmd/xml2txt/xml2txt.go
+++ b/RO/cmd/xml2txt/xml2txt.go
@@ -5,6 +5,8 @@ import (
 	"github.com/jbowtie/gokogiri"
 	"github.com/pebbe/textcat/v2"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"bytes"
 	"encoding/xml"
 	"fmt"
@@ -24,7 +26,7 @@ var (
 	x = e.ExitErr
 	w = e.WarnErr

-	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
+	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
 )

 func main() {
@@ -36,16 +38,16 @@ func main() {
 	switch len(os.Args) {
 	case 1:
 		year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
-		ds = fmt.Sprintf("%d-%02d", year, week)
+		ds = fmt.Sprintf("%d.%02d", year, week)
 	case 2:
 		if !reYearWeek.MatchString(os.Args[1]) {
-			x(fmt.Errorf("arg must be yyyy-ww"))
+			x(fmt.Errorf("arg must be yyyy.ww"))
 		}
 		ds = os.Args[1]
 	default:
 		x(fmt.Errorf("too many arguments"))
 	}
-	dp := ds[:4] + "/" + ds[5:]
+	dp := ds[:4] + "/w" + ds[5:]

 	x(os.Chdir("/net/corpora/nlnieuws/RO/" + dp))
 	x(os.MkdirAll("out", 0777))
@@ -61,19 +63,19 @@ func main() {
 		var buf bytes.Buffer
 		var item Item
 		x(xml.Unmarshal(b, &item))
-		x(buf.WriteString(addEnd(fixSpace(item.Title))))
-		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
+		x(buf.WriteString(u.AddEnd(u.FixSpace(item.Title))))
+		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
 		x(err)
 		root := doc.Root()
 		divs, err := root.Search(`//div[@class="donatieformlinks"]`)
 		x(err)
-		for _, div := range divs {
-			div.Remove()
+		for i := len(divs) - 1; i >= 0; i-- {
+			divs[i].Remove()
 		}
 		pp, err := root.Search(`//body//p[not(.//a[contains(@href,"reportersonline.nl/support")])]`)
 		x(err)
 		for _, p := range pp {
-			x(buf.WriteString(addEnd(fixSpace(p.Content()))))
+			x(buf.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
 		}

 		text := buf.String()
@@ -90,33 +92,12 @@ func main() {
 		fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
 		x(err)
 		for _, cat := range item.Cats {
-			x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
+			t := u.FixSpace(cat)
+			if t != "Artikelen" && t != "cafeyn" {
+				x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
+			}
 		}
 		x(fp.WriteString(text))
 		x(fp.Close())
 	}
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
--- a/RO/txt2corpus.sh
+++ b/RO/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/RO
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/RO/corpus/$ds
-
-cd /net/corpora/nlnieuws/RO/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
 rm -fr out
 mkdir out

-../../xml2txt $ds
+xml2txt $ds

 rm -f $corpus.lines
 for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/RTVNoord/Makefile
+++ b/RTVNoord/Makefile
@@ -5,5 +5,5 @@ all: \
 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-rtvnoord: cmd/rtvnoord/*.go
-	go build -o $@ $^
+rtvnoord: cmd/rtvnoord/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/RTVNoord/cmd/rtvnoord/rtvnoord.go
+++ b/RTVNoord/cmd/rtvnoord/rtvnoord.go
@@ -3,6 +3,8 @@ package main
 import (
 	e "codeberg.org/pebbe/errors"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/json"
 	"encoding/xml"
 	"fmt"
@@ -10,7 +12,6 @@ import (
 	"io"
 	"net/http"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -75,7 +76,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/RTVNoord/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -100,7 +101,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/RTVNoord/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -225,7 +226,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 	// text bevat kopjes zonder punt aan het eind
 	lines := strings.Split(doc.Text, "\n")
 	for i, line := range lines {
-		lines[i] = addEnd(fixSpace(line))
+		lines[i] = u.AddEnd(u.FixSpace(line, true))
 	}
 	text := strings.Join(lines, "") + "\n"

@@ -235,16 +236,21 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
 		p(fmt.Fprintln(fp, "##META text tag ="))
 	} else {
 		for _, tag := range doc.Tags {
-			p(fmt.Fprintf(fp, "##META text tag = %s\n", strings.ToLower(fixSpace(tag))))
+			t := strings.ToLower(u.FixSpace(tag))
+			if strings.HasPrefix(t, "br_") {
+				continue
+			}
+			t = strings.TrimPrefix(t, "tr_")
+			p(fmt.Fprintf(fp, "##META text tag = %s\n", t))
 		}
 	}
 	if doc.Cat == "" {
 		p(fmt.Fprintln(fp, "##META text cat ="))
 	} else {
-		p(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(doc.Cat)))
+		p(fmt.Fprintf(fp, "##META text cat = %s\n", u.FixSpace(doc.Cat)))
 	}

-	p(fp.WriteString(addEnd(doc.Title)))
+	p(fp.WriteString(u.AddEnd(doc.Title)))

 	p(fp.WriteString(text))
 	p(fp.Close())
@@ -252,40 +258,3 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool

 	return true
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/RTVNoord/txt2corpus.sh
+++ b/RTVNoord/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/RTVNoord
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/RTVNoord/corpus/$ds
-
-cd /net/corpora/nlnieuws/RTVNoord/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -51,15 +56,15 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/Sargasso/Makefile
+++ b/Sargasso/Makefile
@@ -3,11 +3,11 @@ all: \
 	metadata \
 	sargasso

-xml2txt: cmd/xml2txt/*.go
-	go build -o $@ $^
+xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
+	go build -o $@ $<

 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-sargasso: cmd/sargasso/*.go
-	go build -o $@ $^
+sargasso: cmd/sargasso/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/Sargasso/cmd/sargasso/sargasso.go
+++ b/Sargasso/cmd/sargasso/sargasso.go
@@ -3,13 +3,14 @@ package main
 import (
 	e "codeberg.org/pebbe/errors"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -46,7 +47,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/Sargasso/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -76,7 +77,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -111,16 +112,3 @@ func main() {
 	}

 }
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/Sargasso/cmd/xml2txt/xml2txt.go
+++ b/Sargasso/cmd/xml2txt/xml2txt.go
@@ -4,6 +4,8 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"os"
@@ -21,7 +23,7 @@ type Item struct {
 var (
 	x = e.ExitErr

-	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
+	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
 )

 func main() {
@@ -30,16 +32,16 @@ func main() {
 	switch len(os.Args) {
 	case 1:
 		year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
-		ds = fmt.Sprintf("%d-%02d", year, week)
+		ds = fmt.Sprintf("%d.%02d", year, week)
 	case 2:
 		if !reYearWeek.MatchString(os.Args[1]) {
-			x(fmt.Errorf("arg must be yyyy-ww"))
+			x(fmt.Errorf("arg must be yyyy.ww"))
 		}
 		ds = os.Args[1]
 	default:
 		x(fmt.Errorf("too many arguments"))
 	}
-	dp := ds[:4] + "/" + ds[5:]
+	dp := ds[:4] + "/w" + ds[5:]

 	x(os.Chdir("/net/corpora/nlnieuws/Sargasso/" + dp))
 	x(os.MkdirAll("out", 0777))
@@ -57,41 +59,17 @@ func main() {
 		var item Item
 		x(xml.Unmarshal(b, &item))
 		for _, cat := range item.Cats {
-			x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
+			x(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(cat)))
 		}
-		x(fp.WriteString(addEnd(fixSpace(item.Title))))
-		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
+		x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
+		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
 		x(err)
 		root := doc.Root()
 		pp, err := root.Search(`//body//p`)
 		x(err)
 		for _, p := range pp {
-			x(fp.WriteString(addEnd(fixSpace(p.Content()))))
+			x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
 		}
 		x(fp.Close())
 	}
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
--- a/Sargasso/txt2corpus.sh
+++ b/Sargasso/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/Sargasso
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/Sargasso/corpus/$ds
-
-cd /net/corpora/nlnieuws/Sargasso/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
 rm -fr out
 mkdir out

-../../xml2txt $ds
+xml2txt $ds

 rm -f $corpus.lines
 for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/Sikkom/Makefile
+++ b/Sikkom/Makefile
@@ -5,5 +5,5 @@ all: \
 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-sikkom: cmd/sikkom/*.go
-	go build -o $@ $^
+sikkom: cmd/sikkom/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/Sikkom/cmd/sikkom/sikkom.go
+++ b/Sikkom/cmd/sikkom/sikkom.go
@@ -4,6 +4,8 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"html"
@@ -11,7 +13,6 @@ import (
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -64,7 +65,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/Sikkom/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -89,7 +90,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -151,6 +152,8 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 	p(err)
 	p(resp.Body.Close())

+	body = u.HtmlFix(body)
+
 	s := string(body)

 	ok := true
@@ -220,49 +223,12 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
 	fp, err = os.Create(filename + ".txt")
 	p(err)

-	p(fp.WriteString(addEnd(fixSpace(title))))
+	p(fp.WriteString(u.AddEnd(u.FixSpace(title))))

 	for _, p1 := range pp {
-		p(fp.WriteString(addEnd(fixSpace(p1.Content()))))
+		p(fp.WriteString(u.AddEnd(u.FixSpace(p1.Content()))))
 	}

 	p(fp.Close())
 	return true
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/Sikkom/txt2corpus.sh
+++ b/Sikkom/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/Sikkom
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/Sikkom/corpus/$ds
-
-cd /net/corpora/nlnieuws/Sikkom/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -51,7 +56,7 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
--- a/Tzum/Makefile
+++ b/Tzum/Makefile
@@ -3,11 +3,11 @@ all: \
 	metadata \
 	tzum

-xml2txt: cmd/xml2txt/*.go
-	go build -o $@ $^
+xml2txt: cmd/xml2txt/*.go ../internal/util/*.go
+	go build -o $@ $<

 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-tzum: cmd/tzum/*.go
-	go build -o $@ $^
+tzum: cmd/tzum/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/Tzum/cmd/tzum/tzum.go
+++ b/Tzum/cmd/tzum/tzum.go
@@ -3,13 +3,14 @@ package main
 import (
 	e "codeberg.org/pebbe/errors"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"io"
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -46,7 +47,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/Tzum/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -76,7 +77,7 @@ func main() {
 		}
 		p(err)
 		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/w%02d", year, week)
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -111,16 +112,3 @@ func main() {
 	}

 }
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/Tzum/cmd/xml2txt/xml2txt.go
+++ b/Tzum/cmd/xml2txt/xml2txt.go
@@ -4,6 +4,8 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"encoding/xml"
 	"fmt"
 	"os"
@@ -21,7 +23,7 @@ type Item struct {
 var (
 	x = e.ExitErr

-	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`)
+	reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]\.[0-5][0-9]$`)
 )

 func main() {
@@ -30,16 +32,16 @@ func main() {
 	switch len(os.Args) {
 	case 1:
 		year, week := time.Now().AddDate(0, 0, -7).ISOWeek()
-		ds = fmt.Sprintf("%d-%02d", year, week)
+		ds = fmt.Sprintf("%d.%02d", year, week)
 	case 2:
 		if !reYearWeek.MatchString(os.Args[1]) {
-			x(fmt.Errorf("arg must be yyyy-ww"))
+			x(fmt.Errorf("arg must be yyyy.ww"))
 		}
 		ds = os.Args[1]
 	default:
 		x(fmt.Errorf("too many arguments"))
 	}
-	dp := ds[:4] + "/" + ds[5:]
+	dp := ds[:4] + "/w" + ds[5:]

 	x(os.Chdir("/net/corpora/nlnieuws/Tzum/" + dp))
 	x(os.MkdirAll("out", 0777))
@@ -57,10 +59,14 @@ func main() {
 		var item Item
 		x(xml.Unmarshal(b, &item))
 		for _, cat := range item.Cats {
-			x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
+			t := u.FixSpace(cat)
+			if t == "Nieuws" {
+				continue
+			}
+			x(fmt.Fprintf(fp, "##META text tag = %s\n", t))
 		}
-		x(fp.WriteString(addEnd(fixSpace(item.Title))))
-		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))
+		x(fp.WriteString(u.AddEnd(u.FixSpace(item.Title))))
+		doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + u.HtmlFixString(item.Text) + `</body></html>`))
 		x(err)
 		root := doc.Root()
 		pp, err := root.Search(`//body/p`)
@@ -68,33 +74,9 @@ func main() {
 		for _, p := range pp {
 			s := p.Content()
 			if !strings.Contains(s, "verscheen eerst op Tzum.") {
-				x(fp.WriteString(addEnd(fixSpace(p.Content()))))
+				x(fp.WriteString(u.AddEnd(u.FixSpace(p.Content()))))
 			}
 		}
 		x(fp.Close())
 	}
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
--- a/Tzum/txt2corpus.sh
+++ b/Tzum/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/Tzum
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -22,11 +25,13 @@ else
    esac
 fi

-dp=${ds//-//}
+year=${ds%.*}
+week=${ds#*.}
+dp=$year/w$week
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/Tzum/corpus/$ds
-
-cd /net/corpora/nlnieuws/Tzum/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -38,7 +43,7 @@ fi
 rm -fr out
 mkdir out

-../../xml2txt $ds
+xml2txt $ds

 rm -f $corpus.lines
 for i in out/*.txt
@@ -53,14 +58,14 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/VRT/Makefile
+++ b/VRT/Makefile
@@ -5,5 +5,5 @@ all: \
 metadata: cmd/metadata/*.go
 	go build -o $@ $^

-vrt: cmd/vrt/*.go
-	go build -o $@ $^
+vrt: cmd/vrt/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/VRT/cmd/vrt/vrt.go
+++ b/VRT/cmd/vrt/vrt.go
@@ -4,6 +4,8 @@ import (
 	e "codeberg.org/pebbe/errors"
 	"github.com/jbowtie/gokogiri"

+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
 	"bytes"
 	"encoding/xml"
 	"fmt"
@@ -11,7 +13,6 @@ import (
 	"net/http"
 	"net/url"
 	"os"
-	"path/filepath"
 	"strings"
 	"time"
 )
@@ -78,7 +79,7 @@ func main() {
 	}()

 	myLock := "/net/corpora/nlnieuws/VRT/lock"
-	mkLock(myLock)
+	u.MkLock(myLock)
 	defer func() {
 		_ = os.Remove(myLock)
 	}()
@@ -109,8 +110,7 @@ func main() {
 		if t2.After(t) {
 			t = t2
 		}
-		year, week := t.ISOWeek()
-		dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week)
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
 		if exists(dirname + "/lock") {
 			continue
 		}
@@ -179,6 +179,8 @@ func doArticle(filename string, url string, title string, tags []string, cats []
 	p(err)
 	p(resp.Body.Close())

+	body = u.HtmlFix(body)
+
 	/*
 		s := string(body)
 		ok := true
@@ -242,18 +244,18 @@ func doArticle(filename string, url string, title string, tags []string, cats []
 		p(fmt.Fprintln(&buf, "##META text cat ="))
 	} else {
 		for _, cat := range cats {
-			p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat)))
+			p(fmt.Fprintf(&buf, "##META text cat = %s\n", u.FixSpace(cat)))
 		}
 	}
 	if len(tags) == 0 {
 		p(fmt.Fprintln(&buf, "##META text tag ="))
 	} else {
 		for _, tag := range tags {
-			p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
+			p(fmt.Fprintf(&buf, "##META text tag = %s\n", u.FixSpace(tag)))
 		}
 	}

-	_, err = buf.WriteString(addEnd(fixSpace(title)))
+	_, err = buf.WriteString(u.AddEnd(u.FixSpace(title)))
 	p(err)

 	fouten := make([]string, 0)
@@ -262,7 +264,7 @@ func doArticle(filename string, url string, title string, tags []string, cats []
 	pp, err := root.Search(`//div[@data-sentry-component="ArticleHeading"]//*[contains(@class,"prose-article-body-r")]`)
 	p(err)
 	for _, p1 := range pp {
-		p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content()))))
+		p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(p1.Content()))))
 		found = true
 	}
 	if !found {
@@ -277,7 +279,7 @@ func doArticle(filename string, url string, title string, tags []string, cats []
 			`//div[@data-sentry-component="ArticleTitle"]//h2`)
 	p(err)
 	for _, p1 := range pp {
-		p(fmt.Fprint(&buf, addEnd(fixSpace(p1.Content()))))
+		p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(p1.Content()))))
 		found = true
 	}
 	if !found {
@@ -311,40 +313,3 @@ func doArticle(filename string, url string, title string, tags []string, cats []

 	return true
 }
-
-func addEnd(s string) string {
-	s = strings.TrimSpace(s)
-	n := len(s)
-	if n == 0 {
-		return ""
-	}
-	if n > 0 {
-		if strings.ContainsAny(s[n-1:], ".!?") {
-			return s + "\n"
-		}
-	}
-	if n > 1 {
-		s2 := s[n-2:]
-		if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
-			return s + "\n"
-		}
-	}
-	return s + ".\n"
-}
-
-func fixSpace(s string) string {
-	return strings.Join(strings.Fields(s), " ")
-}
-
-func mkLock(filename string) {
-	pid := os.Getpid()
-	link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
-	p(os.Symlink(link, filename))
-
-	name, err := os.Readlink(filename)
-	p(err)
-
-	if name != link {
-		p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
-	}
-}
--- a/VRT/txt2corpus.sh
+++ b/VRT/txt2corpus.sh
@@ -2,17 +2,20 @@

 set -e

+BASE=/net/corpora/nlnieuws
+PART=$BASE/VRT
+
 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
 . /net/aps/etc/alpino-activate.sh > /dev/null

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -2days +%Y-%m-%d`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
            ds=$1
            ;;
        *)
@@ -23,10 +26,11 @@ else
 fi

 dp=${ds//-//}
+year=${ds%%-*}
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year

-corpus=/net/corpora/nlnieuws/VRT/corpus/$ds
-
-cd /net/corpora/nlnieuws/VRT/$dp
+cd $PART/$dp

 ln -s lock.$$ lock
 if [ "`readlink lock`" != lock.$$ ]
@@ -51,15 +55,15 @@ cd out
 mkdir xml
 Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log

-../../../metadata
+metadata

 cd xml
 rm -f $corpus.data.dz $corpus.index
 alto -q -o $corpus.data.dz *.xml

 # telling per bericht, niet per zin
-/net/corpora/nlnieuws/namen.sh -x C -s $corpus.data.dz > $corpus.cat.txt
-/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+query.sh -x C -s $corpus.data.dz > $corpus.cat.txt
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt

 cd ../..
 rm -fr out
--- a/Volkskrant/Makefile
+++ b/Volkskrant/Makefile
@@ -0,0 +1,9 @@
+# Default target (first in the file): build both binaries.
+all: \
+	metadata \
+	volkskrant
+
+# Rebuilt when any Go file under cmd/metadata changes; every prerequisite
+# ($^) is passed to the compiler.
+metadata: cmd/metadata/*.go
+	go build -o $@ $^
+
+# The shared util sources are listed as prerequisites so that a change there
+# triggers a rebuild, but only the first prerequisite ($<) is passed to
+# go build.
+# NOTE(review): $< is a single file; if cmd/volkskrant ever holds more than
+# one Go file the others would not be compiled -- confirm.
+volkskrant: cmd/volkskrant/*.go ../internal/util/*.go
+	go build -o $@ $<
--- a/Volkskrant/cmd/metadata/metadata.go
+++ b/Volkskrant/cmd/metadata/metadata.go
@@ -0,0 +1,131 @@
+package main
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"bufio"
+	"encoding/xml"
+	"fmt"
+	"html"
+	"os"
+	"strings"
+	"time"
+)
+
+// Item is the part of an <item> XML document that this program decodes.
+type Item struct {
+	XMLName  xml.Name `xml:"item"`
+	// UnixTime is a Unix timestamp in seconds; doXml converts it into the
+	// "pubdate" metadata value.
+	UnixTime int64    `xml:"unixTime"`
+}
+
+// Package-level helpers and state.
+//   - x: e.ExitErr from codeberg.org/pebbe/errors, used as the error check
+//     after every fallible call (presumably terminates the program on a
+//     non-nil error -- confirm against the library).
+//   - escape: HTML-escapes values before they are placed in XML attributes.
+//   - data: <meta .../> elements collected by doText and doXml, keyed by the
+//     file name without its extension; read by main.
+//   - location: set by main to Europe/Amsterdam and used by doXml to turn a
+//     timestamp into a calendar date.
+var (
+	x        = e.ExitErr
+	escape   = html.EscapeString
+	data     = make(map[string][]string)
+	location *time.Location
+)
+
+// main collects metadata for the articles in the current directory and its
+// parent, then inserts a <metadata> element into every .xml file in ./xml.
+//
+// Metadata comes from "##META" lines in .txt files (doText) and from the
+// timestamp in .xml files (doXml). Each file in ./xml is rewritten through a
+// ".tmp" file that is renamed over the original once it is complete.
+func main() {
+	var err error
+	location, err = time.LoadLocation("Europe/Amsterdam")
+	x(err)
+
+	// Scan the current directory first, then its parent; prefix is what
+	// doText and doXml prepend to the file name when opening it.
+	for _, src := range []struct{ dir, prefix string }{
+		{".", ""},
+		{"..", "../"},
+	} {
+		files, err := os.ReadDir(src.dir)
+		x(err)
+		for _, file := range files {
+			filename := file.Name()
+			switch {
+			case strings.HasSuffix(filename, ".txt"):
+				doText(src.prefix, filename)
+			case strings.HasSuffix(filename, ".xml"):
+				doXml(src.prefix, filename)
+			}
+		}
+	}
+
+	files, err := os.ReadDir("xml")
+	x(err)
+	for _, file := range files {
+		filename := file.Name()
+		if !strings.HasSuffix(filename, ".xml") {
+			continue
+		}
+
+		// The key into data is the file name without its first and its last
+		// two dot-separated components. Fewer than three components would
+		// make the slice expression below panic, so report it instead.
+		aa := strings.Split(filename, ".")
+		if len(aa) < 3 {
+			x(fmt.Errorf("unexpected file name %q in xml directory", filename))
+		}
+		base := strings.Join(aa[1:len(aa)-2], ".")
+
+		b, err := os.ReadFile("xml/" + filename)
+		x(err)
+		s := string(b)
+
+		// i becomes the offset of the first "<" after the start of the
+		// "<alpino" tag; the metadata element is inserted at that point.
+		root := strings.Index(s, "<alpino")
+		if root < 0 {
+			x(fmt.Errorf("xml/%s: no <alpino element found", filename))
+		}
+		i := root + 1
+		next := strings.Index(s[i:], "<")
+		if next < 0 {
+			x(fmt.Errorf("xml/%s: no element found after <alpino", filename))
+		}
+		i += next
+
+		fp, err := os.Create("xml/" + filename + ".tmp")
+		x(err)
+		x(fp.WriteString(s[:i]))
+		x(fp.WriteString("<metadata>\n    <meta type=\"text\" name=\"source\" value=\"Volkskrant\"/>\n"))
+		for _, m := range data[base] {
+			x(fp.WriteString("    " + m + "\n"))
+		}
+		x(fp.WriteString("  </metadata>\n  "))
+		// Drop any <metadata> element already present in the remainder so the
+		// file ends up with exactly the one written above.
+		x(fp.WriteString(stripMeta(s[i:])))
+		x(fp.Close())
+		x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
+	}
+}
+
+// doText scans dirname+filename line by line and stores every "##META" line
+// that has more than four whitespace-separated fields as a <meta/> element
+// in data, keyed by filename without its last four characters.
+//
+// Field 1 becomes the type attribute, field 2 the name, and fields 4 and
+// onward (joined by single spaces) the value.
+func doText(dirname, filename string) {
+	key := filename[:len(filename)-4]
+	if _, seen := data[key]; !seen {
+		data[key] = make([]string, 0)
+	}
+
+	fp, err := os.Open(dirname + filename)
+	x(err)
+	defer func() { x(fp.Close()) }()
+
+	lines := bufio.NewScanner(fp)
+	for lines.Scan() {
+		line := lines.Text()
+		if strings.HasPrefix(line, "##META") {
+			if fields := strings.Fields(line); len(fields) > 4 {
+				value := strings.Join(fields[4:], " ")
+				meta := fmt.Sprintf(`<meta type="%s" name="%s" value="%s"/>`,
+					fields[1],
+					escape(fields[2]),
+					escape(value))
+				data[key] = append(data[key], meta)
+			}
+		}
+	}
+	x(lines.Err())
+}
+
+// doXml decodes dirname+filename as an Item and appends a "pubdate" <meta/>
+// element to data, keyed by filename without its last four characters. The
+// date is the item's Unix timestamp expressed as yyyy-mm-dd in location.
+func doXml(dirname, filename string) {
+	key := filename[:len(filename)-4]
+	if _, seen := data[key]; !seen {
+		data[key] = make([]string, 0)
+	}
+
+	raw, err := os.ReadFile(dirname + filename)
+	x(err)
+
+	var item Item
+	x(xml.Unmarshal(raw, &item))
+
+	year, month, day := time.Unix(item.UnixTime, 0).In(location).Date()
+	pubdate := fmt.Sprintf(`<meta type="date" name="pubdate" value="%d-%02d-%02d"/>`,
+		year,
+		int(month),
+		day)
+	data[key] = append(data[key], pubdate)
+}
+
+// stripMeta removes the first <metadata>...</metadata> element from s,
+// together with any spaces, tabs and line breaks that directly follow it.
+//
+// s is returned unchanged when it has no opening tag, or when the opening
+// tag is never closed, so that a truncated element is not half-removed.
+func stripMeta(s string) string {
+	const openTag, closeTag = "<metadata>", "</metadata>"
+
+	start := strings.Index(s, openTag)
+	if start < 0 {
+		return s
+	}
+	// Search for the closing tag only after the opening tag.
+	end := strings.Index(s[start:], closeTag)
+	if end < 0 {
+		return s
+	}
+	rest := s[start+end+len(closeTag):]
+	return s[:start] + strings.TrimLeft(rest, " \t\r\n")
+}
--- a/Volkskrant/cmd/volkskrant/volkskrant.go
+++ b/Volkskrant/cmd/volkskrant/volkskrant.go
@@ -0,0 +1,390 @@
+package main
+
+import (
+	e "codeberg.org/pebbe/errors"
+	"github.com/jbowtie/gokogiri"
+
+	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
+
+	//"encoding/json"
+	"encoding/xml"
+	"fmt"
+	//"html"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"strings"
+	"time"
+)
+
+// Rss is the root <rss> element of the feed that main downloads and decodes.
+type Rss struct {
+	XMLName xml.Name `xml:"rss"`
+	// Items holds every <item> found inside <channel>.
+	Items   []ItemT  `xml:"channel>item"`
+}
+
+// ItemT is a single <item> of the feed.
+type ItemT struct {
+	// PubDate is parsed in main as RFC1123Z, falling back to RFC1123.
+	PubDate  string `xml:"pubDate"`
+	UnixTime int64  `xml:"unixTime"`
+	// Guid is path-escaped in main to form the local base filename.
+	Guid     string `xml:"guid"`
+	// Link is the article URL handed to doArticle.
+	Link     string `xml:"link"`
+	Title    string `xml:"title"`
+	// Data is the raw inner XML of the item; main copies it verbatim into
+	// the saved .xml file.
+	Data     []byte `xml:",innerxml"`
+}
+
+/*
+type GraphT struct {
+	Graph []map[string]any `json:"@graph"`
+}
+*/
+
+var (
+	// p and w are short aliases for the error helpers of the errors package.
+	// NOTE(review): main treats a failed p as a panic (it checks e.Panicked
+	// and recovers); w is used where processing continues - confirm against
+	// codeberg.org/pebbe/errors.
+	p     = e.PanicErr
+	w     = e.WarnErr
+	// agent is sent as the User-Agent header of every HTTP request.
+	agent = "AhrefsBot/7.0"
+)
+
+// exists reports whether os.Stat succeeds for filename. Any Stat error,
+// not only "not found", counts as absent.
+func exists(filename string) bool {
+	if _, err := os.Stat(filename); err != nil {
+		return false
+	}
+	return true
+}
+
+// fileDate returns the text between the first <unixTime> tag in filename
+// and the first </unixTime> that follows it.
+//
+// It returns "" when the file cannot be read or when either tag is
+// missing. The opening tag is checked explicitly: a file that only holds a
+// closing tag must not yield an arbitrary slice of its contents.
+func fileDate(filename string) string {
+	b, err := os.ReadFile(filename)
+	if err != nil {
+		return ""
+	}
+	_, rest, found := strings.Cut(string(b), "<unixTime>")
+	if !found {
+		return ""
+	}
+	value, _, found := strings.Cut(rest, "</unixTime>")
+	if !found {
+		return ""
+	}
+	return value
+}
+
+// main downloads the Volkskrant RSS feed and, for every item, stores the
+// item XML under a directory named after its publication date and then
+// calls doArticle to fetch the article text. Date directories that contain
+// a "lock" entry are skipped. The whole run is guarded by a lock symlink.
+func main() {
+	// A failed p sets e.Panicked; turn that into exit status 1. This defer
+	// is registered first, so it runs after the lock has been removed.
+	defer func() {
+		if e.Panicked {
+			_ = recover()
+			os.Exit(1)
+		}
+	}()
+
+	const lockFile = "/net/corpora/nlnieuws/Volkskrant/lock"
+	u.MkLock(lockFile)
+	defer func() {
+		_ = os.Remove(lockFile)
+	}()
+
+	req, err := http.NewRequest("GET", "https://www.volkskrant.nl/rss.xml", nil)
+	p(err)
+	req.Header.Set("User-Agent", agent)
+
+	resp, err := (&http.Client{}).Do(req)
+	p(err)
+	feed, err := io.ReadAll(resp.Body)
+	p(err)
+	p(resp.Body.Close())
+
+	var rss Rss
+	p(xml.Unmarshal(feed, &rss))
+	if len(rss.Items) == 0 {
+		p(fmt.Errorf("len(rss.Items) == 0"))
+	}
+
+	for _, item := range rss.Items {
+		pub, err := time.Parse(time.RFC1123Z, item.PubDate)
+		if err != nil {
+			pub, err = time.Parse(time.RFC1123, item.PubDate)
+		}
+		p(err)
+
+		year, month, day := pub.Date()
+		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Volkskrant/%d/%02d/%02d", year, int(month), day)
+		if exists(dirname + "/lock") {
+			continue
+		}
+
+		filename := dirname + "/" + url.PathEscape(item.Guid)
+		xmlName := filename + ".xml"
+
+		// Compare against the previously saved item before it is overwritten.
+		needUpdate := fileDate(xmlName) != fmt.Sprintf("%d", pub.Unix())
+
+		p(os.MkdirAll(dirname, 0777))
+		func() {
+			var ok bool
+			defer func() {
+				if e.Panicked {
+					fmt.Fprintln(os.Stderr, "----", filename)
+					fmt.Fprintln(os.Stderr, "----", item.Link)
+				}
+				// Drop the item XML again when the article was not handled.
+				if !ok {
+					_ = os.Remove(xmlName)
+				}
+			}()
+			out, err := os.Create(xmlName)
+			p(err)
+			p(out.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n"))
+			p(fmt.Fprintf(out, "<unixTime>%d</unixTime>", pub.Unix()))
+			p(out.Write(item.Data))
+			p(out.WriteString("</item>\n"))
+			p(out.Close())
+			p(os.Chtimes(xmlName, pub, pub))
+			ok = doArticle(filename, item.Link, item.Title, pub, needUpdate)
+		}()
+	}
+}
+
+func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
+	if exists(filename + ".skip") {
+		return true
+	}
+	if needUpdate {
+		_ = os.Remove(filename + ".err")
+		_ = os.Remove(filename + ".html")
+		// _ = os.Remove(filename + ".json")
+		_ = os.Remove(filename + ".txt")
+	} else {
+		if exists(filename + ".txt") {
+			return true
+		}
+	}
+	time.Sleep(2 * time.Second)
+
+	req, err := http.NewRequest("GET", url, nil)
+	p(err)
+	req.Header.Set("User-Agent", agent)
+
+	client := &http.Client{}
+	resp, err := client.Do(req)
+	p(err)
+	body, err := io.ReadAll(resp.Body)
+	p(err)
+	p(resp.Body.Close())
+
+	body = u.HtmlFix(body)
+
+	doc, err := gokogiri.ParseHtml(body)
+	p(err)
+
+	/*
+
+		s := string(body)
+
+		ok = true
+		i1 := strings.Index(s, `<script type="application/ld+json"`)
+		if i1 < 0 {
+			ok = false
+		} else {
+			i1 += strings.Index(s[i1:], `>`) + 1
+			i2 := i1 + strings.Index(s[i1:], `</script>`)
+			if i2 < i1 {
+				ok = false
+			} else {
+				s = html.UnescapeString(s[i1:i2])
+			}
+		}
+		if !ok {
+			_ = w(fmt.Errorf("script jsonld not found: %s", url))
+
+			fp, err := os.Create(filename + ".err")
+			p(err)
+			p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
+			p(fp.Close())
+			p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+			fp, err = os.Create(filename + ".html")
+			p(err)
+			p(fp.Write(body))
+			p(fp.Close())
+			p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+			return false
+		}
+
+		var graph GraphT
+		p(json.Unmarshal([]byte(s), &graph))
+		for _, g := range graph.Graph {
+			t := g["@type"]
+			switch v := t.(type) {
+			case string:
+				if v == "NewsArticle" {
+					b, err := json.Marshal(g)
+					p(err)
+					s = string(b)
+				}
+			}
+		}
+
+		fp, err := os.Create(filename + ".json")
+		p(err)
+		p(fp.WriteString(s))
+		p(fp.Close())
+		p(os.Chtimes(filename+".json", timestamp, timestamp))
+	*/
+
+	root := doc.Root()
+
+	articles, err := root.Search(`//article[@id="article-content"]`)
+	p(err)
+	if len(articles) == 0 {
+		_ = w(fmt.Errorf("empty: %s", url))
+
+		fp, err := os.Create(filename + ".err")
+		p(err)
+		p(fmt.Fprintf(fp, "empty: %s\n", url))
+		p(fp.Close())
+		p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+		fp, err = os.Create(filename + ".html")
+		p(err)
+		p(fp.Write(body))
+		p(fp.Close())
+		p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+		return false
+	}
+	article := articles[0]
+
+	live, err := article.Search(`.//*[@data-test-id="live-blog-label"]`)
+	p(err)
+	if len(live) > 0 {
+		fp, err := os.Create(filename + ".skip")
+		p(fp.WriteString("liveblog\n"))
+		p(err)
+		p(os.Chtimes(filename+".skip", timestamp, timestamp))
+		return true
+	}
+
+	headers, err := article.Search(`.//header`)
+	p(err)
+	if len(headers) == 0 {
+		_ = w(fmt.Errorf("no header: %s", url))
+
+		fp, err := os.Create(filename + ".err")
+		p(err)
+		p(fmt.Fprintf(fp, "no elements: %s\n", url))
+		p(fp.Close())
+		p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+		fp, err = os.Create(filename + ".html")
+		p(err)
+		p(fp.Write(body))
+		p(fp.Close())
+		p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+		return false
+
+	}
+	header := headers[0]
+
+	isOpinie := false
+	isColumn := false
+	tags := make([]string, 0)
+	ell, err := header.Search(`.//*[@data-test-id="article-label"]`)
+	p(err)
+	if len(ell) == 0 {
+		_ = w(fmt.Errorf("no labels: %s", url))
+	}
+	for _, el := range ell {
+		s := strings.TrimSpace(el.Content())
+		if s != "" && s != "Nieuws" {
+			tags = append(tags, s)
+		}
+		if s1 := strings.ToLower(s); s1 == "opinie" {
+			isOpinie = true
+		} else if s1 == "column" {
+			isColumn = true
+		}
+	}
+
+	pars := make([]string, 0)
+
+	found := false
+	ell, err = header.Search(`.//*[@data-test-id="header-intro"]`)
+	p(err)
+	for _, el := range ell {
+		s := strings.TrimSpace(el.Content())
+		if s != "" {
+			pars = append(pars, s)
+			found = true
+		}
+	}
+	if !found && !isOpinie && !isColumn {
+		_ = w(fmt.Errorf("no intro: %s", url))
+	}
+
+	specials, err := article.Search(`.//aside | .//figure | .//figcaption | .//section//b`)
+	p(err)
+	for i := len(specials) - 1; i >= 0; i-- {
+		specials[i].Remove()
+	}
+
+	ell, err = article.Search(`.//section//*[@data-article-element-index]`)
+	p(err)
+	if len(ell) == 0 {
+		_ = w(fmt.Errorf("no elements: %s", url))
+
+		fp, err := os.Create(filename + ".err")
+		p(err)
+		p(fmt.Fprintf(fp, "no elements: %s\n", url))
+		p(fp.Close())
+		p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+		fp, err = os.Create(filename + ".html")
+		p(err)
+		p(fp.Write(body))
+		p(fp.Close())
+		p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+		return false
+	}
+
+	found = false
+	for _, el := range ell {
+		s := strings.TrimSpace(el.Content())
+		if s != "" {
+			pars = append(pars, s)
+			found = true
+		}
+	}
+	if !found {
+		_ = w(fmt.Errorf("no text, skipping: %s", url))
+		fp, err := os.Create(filename + ".skip")
+		p(fp.WriteString(url + "\n"))
+		p(err)
+		p(os.Chtimes(filename+".skip", timestamp, timestamp))
+
+		fp, err = os.Create(filename + ".html")
+		p(err)
+		p(fp.Write(body))
+		p(fp.Close())
+		p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+		return true
+	}
+
+	fp, err := os.Create(filename + ".txt")
+	p(err)
+
+	if len(tags) == 0 {
+		p(fmt.Fprintln(fp, "##META text tag ="))
+	} else {
+		for _, tag := range tags {
+			p(fmt.Fprintf(fp, "##META text tag = %s\n", u.FixSpace(tag)))
+		}
+	}
+
+	p(fp.WriteString(u.AddEnd(u.FixSpace(title))))
+
+	for _, par := range pars {
+		p(fp.WriteString(u.AddEnd(u.FixSpace(par))))
+	}
+
+	p(fp.Close())
+
+	p(os.Chtimes(filename+".txt", timestamp, timestamp))
+
+	return true
+}
--- a/Volkskrant/txt2corpus.sh
+++ b/Volkskrant/txt2corpus.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+# Turns the Volkskrant .txt files of one day into a parsed corpus.
+#
+# Usage: txt2corpus.sh [yyyy-mm-dd]
+# Without an argument the date of two days ago is used.
+#
+# Output files are named $PART/corpus/<year>/<date>.{lines,log,data.dz,tag.txt}.
+
+set -e
+
+BASE=/net/corpora/nlnieuws
+PART=$BASE/Volkskrant
+
+unset CDPATH
+PATH=$PART:$BASE/bin:$BASE:/net/aps/bin:$PATH
+export TZ=Europe/Amsterdam
+. /net/aps/etc/alpino-activate.sh > /dev/null
+
+# Pick the day to process and validate its format.
+if [ "$1" = "" ]
+then
+    ds=`date -d -2days +%Y-%m-%d`
+else
+    case "$1" in
+        2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])
+            ds=$1
+            ;;
+        *)
+            echo INVALID
+            exit 1
+            ;;
+    esac
+fi
+
+# dp is the date as a directory path (yyyy/mm/dd).
+dp=${ds//-//}
+year=${ds%%-*}
+corpus=$PART/corpus/$year/$ds
+mkdir -p $PART/corpus/$year
+
+cd $PART/$dp
+
+# Take the per-day lock: a symlink named "lock" pointing at lock.<pid>.
+# NOTE(review): there is no trap, so when a later step fails under
+# "set -e" the lock is left behind.
+ln -s lock.$$ lock
+if [ "`readlink lock`" != lock.$$ ]
+then
+    echo Getting lock failed
+    exit 1
+fi
+
+rm -fr out
+mkdir out
+
+# Strip leading whitespace and ##META lines, tokenize, and prefix every
+# resulting line with the key "vk.<basename>.<line number>|".
+rm -f $corpus.lines
+for i in *.txt
+do
+    b=`basename $i .txt`
+    perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \
+        | perl -e '$n = 0; while(<>) { $n++; print("vk.'$b'.$n|$_"); }' \
+        >> $corpus.lines
+done
+
+# Parse all lines with Alpino, run from out/ with the xml/ directory in place.
+cd out
+mkdir xml
+Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
+
+# Presumably the metadata tool built from cmd/metadata; it is found via
+# $PART on PATH and runs from out/ - confirm.
+metadata
+
+# Pack the parsed files into one corpus file.
+cd xml
+rm -f $corpus.data.dz $corpus.index
+alto -q -o $corpus.data.dz *.xml
+
+# count per article, not per sentence
+query.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+
+# Clean up the working directory and release the lock.
+cd ../..
+rm -fr out
+
+rm -f lock
--- a/cmd/data2json/data2json.go
+++ b/cmd/data2json/data2json.go
@@ -7,6 +7,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"os"
+	"regexp"
 	"strconv"
 	"strings"
 	"time"
@@ -49,13 +50,14 @@ var (
 	parts = map[string]struct {
 		file   string
 		suffix string
+		re     *regexp.Regexp
 	}{
-		"nieuwe namen":   {"nieuwe-namen", ".t20"},
-		"nieuwe woorden": {"nieuwe-woorden-extra", ".t20"},
-		"personen":       {"personen", ""},
-		"andere namen":   {"overige-namen", ""},
-		"locaties":       {"locaties", ""},
-		"organisaties":   {"organisaties", ""},
+		"nieuwe namen":   {"nieuwe-namen", ".t20", nil},
+		"nieuwe woorden": {"nieuwe-woorden-extra", ".t20", nil},
+		"personen":       {"personen", "", nil},
+		"andere namen":   {"overige-namen", "", nil},
+		"locaties":       {"locaties", "", nil},
+		"organisaties":   {"organisaties", "", regexp.MustCompile(`^(ANP|AT5)`)},
 	}

 	maanden = strings.Fields("x januari februari maart april mei juni juli augustus september oktober november december")
@@ -70,9 +72,9 @@ var (

 func main() {

-	aa := strings.Split(os.Args[1], "-")
+	aa := strings.Split(os.Args[1], ".")
 	if len(aa) != 2 {
-		x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
+		x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
 	}

 	var err error
@@ -128,7 +130,8 @@ func makeParts(source string) *Parts {
 func makeValues(source, part string) [][5]any {
 	v := make([][5]any, 0)

-	filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%s-%s-%d-%02d-%d%s",
+	filename := fmt.Sprintf("/net/corpora/nlnieuws/data/%d/%s-%s-%d.%02d-%d%s",
+		year,
 		sources[source],
 		parts[part].file,
 		year,
@@ -141,12 +144,15 @@ func makeValues(source, part string) [][5]any {
 	scanner := bufio.NewScanner(fp)
 	lineno := 0
 	for scanner.Scan() {
-		lineno++
 		line := scanner.Text()
 		aa := strings.Split(line, "\t")
 		count, err := strconv.Atoi(strings.TrimSpace(aa[0]))
 		x(err)
 		word := aa[1]
+		if parts[part].re != nil && parts[part].re.MatchString(word) {
+			continue
+		}
+		lineno++
 		var tags, lemma, postag string
 		if len(aa) > 2 {
 			tags = aa[2]
@@ -226,7 +232,7 @@ func dates() (start, first, last string, names []string) {
 	t3 := tStart
 	for range size {
 		y, w := t3.ISOWeek()
-		names = append(names, fmt.Sprintf("%d/%02d", y, w))
+		names = append(names, fmt.Sprintf("%d/w%02d", y, w))
 		t3 = t3.AddDate(0, 0, 7)
 	}
 	t3 = tStart
--- a/cmd/dates2json/dates2json.go
+++ b/cmd/dates2json/dates2json.go
@@ -26,13 +26,24 @@ var (

 func main() {

-	files, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
+	dirs, err := os.ReadDir("/net/corpora/nlnieuws/data/json")
 	x(err)
+	for _, dir := range dirs {
+		if !dir.IsDir() {
+			continue
+		}
+		dirname := dir.Name()
+		if dirname[0] != '2' {
+			continue
+		}
+		files, err := os.ReadDir("/net/corpora/nlnieuws/data/json/" + dirname)
+		x(err)

-	for _, file := range files {
-		filename := file.Name()
-		if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
-			addWeek(filename[5:12])
+		for _, file := range files {
+			filename := file.Name()
+			if strings.HasPrefix(filename, "DATA-") && strings.HasSuffix(filename, "-4.json") {
+				addWeek(filename[5:12])
+			}
 		}
 	}

--- a/cmd/rang/rang.go
+++ b/cmd/rang/rang.go
@@ -0,0 +1,63 @@
+package main
+
+// alto 'fp://node[....]' 'tt:%w\t%I' $files | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"bufio"
+	"fmt"
+	"os"
+	"sort"
+	"strings"
+)
+
+var (
+	// x is the error handler used throughout this program.
+	// NOTE(review): presumably ExitErr ends the process on a non-nil
+	// error - confirm against codeberg.org/pebbe/errors.
+	x = e.ExitErr
+)
+
+// Item pairs a word with the number of input lines whose first
+// tab-separated field is that word.
+type Item struct {
+	word  string
+	count int
+}
+
+// main reads tab-separated lines from standard input, counts how often each
+// first field occurs, and prints "rank<TAB>word" for every word seen at
+// least twice. Words are ordered by descending count, then alphabetically.
+// Words with equal counts share a rank, and ranks increase by one per
+// distinct count.
+func main() {
+	freq := make(map[string]int)
+
+	in := bufio.NewScanner(os.Stdin)
+	for in.Scan() {
+		word, _, _ := strings.Cut(in.Text(), "\t")
+		freq[word]++
+	}
+	x(in.Err())
+
+	ranked := make([]Item, 0)
+	for word, n := range freq {
+		ranked = append(ranked, Item{word: word, count: n})
+	}
+
+	sort.Slice(ranked, func(i, j int) bool {
+		left, right := ranked[i], ranked[j]
+		if left.count != right.count {
+			return left.count > right.count
+		}
+		return left.word < right.word
+	})
+
+	rank, last := 0, 0
+	for _, it := range ranked {
+		// The list is sorted by count, so nothing after this point qualifies.
+		if it.count < 2 {
+			break
+		}
+		if it.count != last {
+			rank++
+			last = it.count
+		}
+		fmt.Printf("%d\t%s\n", rank, it.word)
+	}
+}
--- a/cmd/score/score.go
+++ b/cmd/score/score.go
@@ -1,114 +0,0 @@
-package main
-
-import (
-	e "codeberg.org/pebbe/errors"
-
-	"bufio"
-	"fmt"
-	"os"
-	"regexp"
-	"sort"
-	"strconv"
-	"strings"
-)
-
-type Item struct {
-	text   string
-	lctext string
-	score  int
-	isnew  bool
-}
-
-var (
-	x          = e.ExitErr
-	reYearWeek = regexp.MustCompile(`(.*)([12][0-9][0-9][0-9]-[0-5][0-9])(.*)`)
-	count      = make(map[string]int)
-	items      = make([]Item, 0)
-)
-
-func main() {
-	filename := os.Args[1]
-	prevname := getPrev(filename)
-
-	fp, err := os.Open(prevname)
-	x(err)
-	scanner := bufio.NewScanner(fp)
-	for scanner.Scan() {
-		aa := strings.SplitN(scanner.Text(), "\t", 2)
-		n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
-		x(err)
-		count[aa[1]] = n
-	}
-	x(scanner.Err())
-	x(fp.Close())
-
-	fp, err = os.Open(filename)
-	x(err)
-	scanner = bufio.NewScanner(fp)
-	for scanner.Scan() {
-		aa := strings.SplitN(scanner.Text(), "\t", 2)
-		n, err := strconv.Atoi(strings.TrimSpace(aa[0]))
-		x(err)
-		n1, ok := count[aa[1]]
-		items = append(items, Item{
-			text:   aa[1],
-			lctext: strings.ToLower(aa[1]),
-			score:  n - n1,
-			isnew:  !ok,
-		})
-	}
-	x(scanner.Err())
-	x(fp.Close())
-
-	sort.Slice(items, func(i, j int) bool {
-		/*
-			if items[i].isnew && !items[j].isnew {
-				return true
-			}
-			if !items[i].isnew && items[j].isnew {
-				return false
-			}
-		*/
-		if items[i].score != items[j].score {
-			return items[i].score > items[j].score
-		}
-		return items[i].lctext < items[j].lctext
-	})
-
-	for _, item := range items {
-		/*
-			if item.score < 2 {
-				break
-			}
-		*/
-		p := "."
-		if item.isnew {
-			p = "N"
-		}
-		fmt.Printf("%s\t%4d\t%s\n", p, item.score, item.text)
-	}
-
-}
-
-func getPrev(filename string) string {
-	mm := reYearWeek.FindStringSubmatch(filename)
-	year, err := strconv.Atoi(mm[2][:4])
-	x(err)
-	week, err := strconv.Atoi(mm[2][5:])
-	x(err)
-
-	week--
-	if week == 0 {
-		week = 53
-		year--
-	}
-	newname := fmt.Sprintf("%s%d-%02d%s", mm[1], year, week, mm[3])
-	if week == 53 {
-		_, err := os.Stat(newname)
-		if err == nil {
-			return newname
-		}
-		newname = fmt.Sprintf("%s%d-%02d%s", mm[1], year, week-1, mm[3])
-	}
-	return newname
-}
--- a/cmd/top20/top20.go
+++ b/cmd/top20/top20.go
@@ -11,7 +11,7 @@ import (

 var (
 	x      = e.ExitErr
-	reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]-[0-5][0-9])(.*)`)
+	reFile = regexp.MustCompile(`(.*)(2[0-9][0-9][0-9]\.[0-5][0-9])(.*)`)
 	seen   = make(map[string]bool)
 )

@@ -23,21 +23,30 @@ func main() {
 	suffix := m[3] + ".t20"
 	target := infile + ".t20"

-	x(os.Chdir("/net/corpora/nlnieuws/data"))
-
-	files, err := os.ReadDir(".")
+	dirs, err := os.ReadDir("..")
 	x(err)
-	for _, file := range files {
-		name := file.Name()
-		if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
-			fp, err := os.Open(name)
-			x(err)
-			scanner := bufio.NewScanner(fp)
-			for scanner.Scan() {
-				seen[strings.Split(scanner.Text(), "\t")[1]] = true
+	for _, dir := range dirs {
+		if !dir.IsDir() {
+			continue
+		}
+		dirname := dir.Name()
+		if dirname[0] != '2' {
+			continue
+		}
+		files, err := os.ReadDir("../" + dirname)
+		x(err)
+		for _, file := range files {
+			name := file.Name()
+			if strings.HasPrefix(name, prefix) && strings.HasSuffix(name, suffix) && name < target {
+				fp, err := os.Open("../" + dirname + "/" + name)
+				x(err)
+				scanner := bufio.NewScanner(fp)
+				for scanner.Scan() {
+					seen[strings.Split(scanner.Text(), "\t")[1]] = true
+				}
+				x(scanner.Err())
+				x(fp.Close())
 			}
-			x(scanner.Err())
-			x(fp.Close())
 		}
 	}

--- a/cmd/trends/trends.go
+++ b/cmd/trends/trends.go
@@ -0,0 +1,108 @@
+package main
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"bufio"
+	"fmt"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+// Item is one reported change in relative rank between the two input files.
+type Item struct {
+	word string
+	// diff is the normalised reference rank minus the normalised current rank.
+	diff float64
+	// gone is set for words that occur in the reference file only.
+	gone bool
+}
+
+var (
+	// x is the error handler used throughout this program.
+	// NOTE(review): presumably ExitErr ends the process on a non-nil
+	// error - confirm against codeberg.org/pebbe/errors.
+	x = e.ExitErr
+)
+
+// rankLine is one "rank<TAB>word" line of a ranking file.
+type rankLine struct {
+	rank int
+	word string
+}
+
+// readRanks parses the ranking file filename. It returns the lines in file
+// order and the highest rank found (0 for an empty file). A line without a
+// tab or with a non-numeric rank is reported through x.
+func readRanks(filename string) (entries []rankLine, highest int) {
+	fp, err := os.Open(filename)
+	x(err)
+	defer fp.Close() // opened read-only, so a close error carries no data loss
+
+	scanner := bufio.NewScanner(fp)
+	for lineno := 1; scanner.Scan(); lineno++ {
+		aa := strings.Split(scanner.Text(), "\t")
+		if len(aa) < 2 {
+			x(fmt.Errorf("%s:%d: expected rank<TAB>word", filename, lineno))
+		}
+		n, err := strconv.Atoi(aa[0])
+		x(err)
+		entries = append(entries, rankLine{rank: n, word: aa[1]})
+		highest = max(highest, n)
+	}
+	x(scanner.Err())
+	return entries, highest
+}
+
+// main compares two ranking files: os.Args[1] is the reference ranking and
+// os.Args[2] the current one. Ranks are normalised by "highest rank + 1" of
+// their own file, and every word whose normalised rank differs by more than
+// 0.05 is printed as "diff<TAB>flag<TAB>word", largest diff first. The flag
+// is "X" for words that are present in the reference file only.
+func main() {
+	if len(os.Args) != 3 {
+		x(fmt.Errorf("usage: %s reference-ranks current-ranks", os.Args[0]))
+	}
+
+	refLines, refmax := readRanks(os.Args[1])
+	refmax++
+	refs := make(map[string]int, len(refLines))
+	for _, line := range refLines {
+		refs[line.word] = line.rank
+	}
+
+	// Take the real maximum rather than the rank on the last line, so an
+	// empty or unsorted current file is handled as well.
+	curLines, curmax := readRanks(os.Args[2])
+	curmax++
+
+	items := make([]Item, 0)
+	seen := make(map[string]bool)
+
+	for _, line := range curLines {
+		seen[line.word] = true
+		m, ok := refs[line.word]
+		if !ok {
+			// A word missing from the reference is placed just below the
+			// lowest reference rank.
+			m = refmax
+		}
+		diff := float64(m)/float64(refmax) - float64(line.rank)/float64(curmax)
+		if diff > 0.05 || diff < -0.05 {
+			items = append(items, Item{
+				word: line.word,
+				diff: diff,
+			})
+		}
+	}
+
+	// Words that dropped out of the current ranking get normalised rank 1.0.
+	for key, value := range refs {
+		if !seen[key] {
+			diff := float64(value)/float64(refmax) - 1.0
+			if diff > 0.05 || diff < -0.05 {
+				items = append(items, Item{
+					word: key,
+					diff: diff,
+					gone: true,
+				})
+			}
+		}
+	}
+
+	sort.Slice(items, func(a, b int) bool {
+		if items[a].diff == items[b].diff {
+			return items[a].word < items[b].word
+		}
+		return items[a].diff > items[b].diff
+	})
+
+	for _, item := range items {
+		var s string
+		if item.gone {
+			s = "X"
+		}
+		fmt.Printf("%f\t%s\t%s\n", item.diff, s, item.word)
+	}
+}
--- a/cmd/week2files/week2files.go
+++ b/cmd/week2files/week2files.go
@@ -15,9 +15,9 @@ var (
 )

 func main() {
-	aa := strings.Split(os.Args[1], "-")
+	aa := strings.Split(os.Args[1], ".")
 	if len(aa) != 2 {
-		x(fmt.Errorf("ongeldig argument, moet in formaat yyyy-dd zijn"))
+		x(fmt.Errorf("ongeldig argument, moet in formaat yyyy.dd zijn"))
 	}

 	year, err := strconv.Atoi(aa[0])
@@ -58,7 +58,7 @@ func main() {
 			fmt.Print(" -or")
 		}
 		y, w := t2.ISOWeek()
-		fmt.Printf(" -name %d-%02d.data.dz", y, w)
+		fmt.Printf(" -name %d.%02d.data.dz", y, w)
 		t2 = t2.AddDate(0, 0, 7)
 	}

--- a/cmd/weekadd/weekadd.go
+++ b/cmd/weekadd/weekadd.go
@@ -0,0 +1,50 @@
+package main
+
+import (
+	ww "git.web.rug.nl/p209327/nlnieuws/internal/week"
+
+	e "codeberg.org/pebbe/errors"
+
+	"fmt"
+	"io"
+	"os"
+	"strconv"
+	"strings"
+)
+
+var (
+	// x is the error handler used throughout this program.
+	// NOTE(review): presumably ExitErr ends the process on a non-nil
+	// error - confirm against codeberg.org/pebbe/errors.
+	x = e.ExitErr
+)
+
+// syntax writes the usage message, framed by blank lines, to w.
+func syntax(w io.Writer) {
+	fmt.Fprintf(w, "\nUsage: %s yyyy.ww n\n\n", os.Args[0])
+}
+
+// main prints the ISO week that lies n weeks after the week given as
+// "yyyy.ww" (n may be negative), in the same "yyyy.ww" format.
+//
+// With a wrong number of arguments the usage text goes to standard output.
+// A first argument that is not of the form "yyyy.ww" prints the usage text
+// to standard error and exits with status 1, so that a caller using command
+// substitution does not carry on with an empty result.
+func main() {
+	if len(os.Args) != 3 {
+		syntax(os.Stdout)
+		return
+	}
+
+	aa := strings.Split(os.Args[1], ".")
+	if len(aa) != 2 {
+		syntax(os.Stderr)
+		os.Exit(1)
+	}
+	year, err := strconv.Atoi(aa[0])
+	x(err)
+	week, err := strconv.Atoi(aa[1])
+	x(err)
+
+	n, err := strconv.Atoi(os.Args[2])
+	x(err)
+
+	t, err := ww.WeekDate(year, week)
+	x(err)
+
+	y, w := t.AddDate(0, 0, n*7).ISOWeek()
+	fmt.Printf("%d.%02d\n", y, w)
+}
--- a/collect.sh
+++ b/collect.sh
@@ -3,8 +3,9 @@
 set -e

 unset CDPATH
-PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+PATH=/net/corpora/nlnieuws/bin:/net/corpora/nlnieuws/python:/net/aps/bin:$PATH
 export TZ=Europe/Amsterdam
+. /net/corpora/nlnieuws/python/env/bin/activate

 verbose=0
 if [ "$1" = "-v" ]
@@ -22,10 +23,10 @@ say () {

 if [ "$1" = "" ]
 then
-    ds=`date -d -7days +%G-%V`
+    ds=`date -d -7days +%G.%V`
 else
    case "$1" in
-        2[0-9][0-9][0-9]-[0-5][0-9])
+        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
@@ -35,11 +36,17 @@ else
    esac
 fi

-cd /net/corpora/nlnieuws/data
+ds1=$(weekadd $ds -1)
+
+year=${ds%%.*}
+
+mkdir -p /net/corpora/nlnieuws/data/$year
+mkdir -p /net/corpora/nlnieuws/data/json/$year
+cd /net/corpora/nlnieuws/data/$year

 declare -A parts
 #parts[alles]='.'
-parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso'
+parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
 parts[amsterdam]='AT5|BuurtAdam|Parool'
 parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
 parts[literatuur]='LitNL|Tzum'
@@ -60,6 +67,7 @@ parts[vlaanderen]='HLN|VRT'
 #parts[Sargasso]='Sargasso'
 #parts[Sikkom]='Sikkom'
 #parts[Tzum]='Tzum'
+#parts[Volkskrant]='Volkskrant'
 #parts[VRT]='VRT'

 for part in ${!parts[@]}
@@ -68,7 +76,7 @@ do

    for i in 1 4
    do
-        files=$(find .. $(week2files $ds $i) | grep -E "$regex") || true
+        files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
        if [ -z "$files" ]
        then
            continue
@@ -158,10 +166,48 @@ do
            | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
            > $part-nieuwe-adjww-extra-$ds-$i

+        # kale tellingen
+
+        say $part-allewoorden-$ds-$i
+        alto \
+            'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
+            | sed -e 's/\t.*//' | uniq -c \
+            | grep -v '^ *1 ' \
+            | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
+            > $part-allewoorden-$ds-$i
+
    done
+
+    # score
+    say $part-allewoorden-$ds.score14
+    stijgers.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 $part-allewoorden-$ds.score14
+    stijgers2json.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 $part-allewoorden-$ds.score14.json
+
+    say $part-locaties-$ds.score14
+    stijgers.py $part-locaties-$ds1-4 $part-locaties-$ds-1 $part-locaties-$ds.score14
+    stijgers2json.py $part-locaties-$ds1-4 $part-locaties-$ds-1 $part-locaties-$ds.score14.json
+
+    say $part-personen-$ds.score14
+    stijgers.py $part-personen-$ds1-4 $part-personen-$ds-1 $part-personen-$ds.score14
+    stijgers2json.py $part-personen-$ds1-4 $part-personen-$ds-1 $part-personen-$ds.score14.json
+
+    say $part-organisaties-$ds.score14
+    stijgers.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 $part-organisaties-$ds.score14
+    stijgers2json.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 $part-organisaties-$ds.score14.json
+
+    say $part-overige-namen-$ds.score14
+    stijgers.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 $part-overige-namen-$ds.score14
+    stijgers2json.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 $part-overige-namen-$ds.score14.json
+
 done

-data2json $ds 1 > json/DATA-$ds-1.json
-data2json $ds 4 > json/DATA-$ds-4.json
-dates2json > json/index1.json
-dates2json > json/index4.json
+data2json $ds 1 > ../json/$year/DATA-$ds-1.json
+data2json $ds 4 > ../json/$year/DATA-$ds-4.json
+dates2json > ../json/index1.json
+dates2json > ../json/index4.json
+
+# rechten bijwerken
+chmod -R g+w /net/corpora/nlnieuws
+chgrp -R software /net/corpora/nlnieuws
--- a/go.mod
+++ b/go.mod
@@ -1,11 +1,13 @@
-module nlnieuws
+module git.web.rug.nl/p209327/nlnieuws

 go 1.26.1

 require (
 	codeberg.org/pebbe/errors v0.4.0
 	github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5
+	github.com/pebbe/compactcorpus v1.0.3
 	github.com/pebbe/textcat/v2 v2.3.0
+	github.com/rug-compling/alpinods v1.18.1
 )

 require github.com/pebbe/util v0.9.0 // indirect
--- a/go.sum
+++ b/go.sum
@@ -2,7 +2,11 @@ codeberg.org/pebbe/errors v0.4.0 h1:G05wsXpC/LRPaL02QYDwtz0sWFWQcIWK1s+MC79LBzU=
 codeberg.org/pebbe/errors v0.4.0/go.mod h1:O7PPxUJM1bWRHq11CRK3wqVaH/3NnRaSVZvh3UhzDCY=
 github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5 h1:tQbR4RKFBFi0+Ll69dXejKKUbQVNaOAT2fjlDvSAfx4=
 github.com/jbowtie/gokogiri v0.0.0-20250107075044-de0f9d4877a5/go.mod h1:kQE2lxPgVKe0JsBZMFFfMm5zBDCuRhaHFKOBzZeCLiw=
+github.com/pebbe/compactcorpus v1.0.3 h1:6qlfXKHTKg7oWKLPCgEgv1scplfvphg/9l9XiRT2HzQ=
+github.com/pebbe/compactcorpus v1.0.3/go.mod h1:SSpTeCZataCjjs82RJb8SOGdjkB3PlR7Z19EY4rInoQ=
 github.com/pebbe/textcat/v2 v2.3.0 h1:RB2egIQgI2a2Ls+I9No6KFQKCZBIFt8Cc/SWCnVtC7Y=
 github.com/pebbe/textcat/v2 v2.3.0/go.mod h1:WLXWuL+fOlQJqn6LmubjD+e78hCC6Y/rAWInh0wq/kg=
 github.com/pebbe/util v0.9.0 h1:PMZd+CpWb8GbWEmFGlL3qd6XPuywl6xFIbrXWi870OA=
 github.com/pebbe/util v0.9.0/go.mod h1:ynWl/SFX4+Seb9fpjVlYevr1f4TP7FrCmyZHiBCg69Q=
+github.com/rug-compling/alpinods v1.18.1 h1:BvPcCnNEQ1QoVSc0RmwJd3kZmvo4iqZ52/vFzVvFS7w=
+github.com/rug-compling/alpinods v1.18.1/go.mod h1:R3BBX8RIw9InVqHZ+1W+MsX8WX8uBkoVNNGE38mqF1Q=
--- a/internal/util/util.go
+++ b/internal/util/util.go
@@ -0,0 +1,74 @@
+package util
+
+import (
+	e "codeberg.org/pebbe/errors"
+
+	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+)
+
+var (
+	// p is the error handler used by MkLock.
+	p           = e.PanicErr
+	// reEOL matches a '.', '!' or '?', optionally followed by one closing
+	// quote character, at the very end of the string.
+	reEOL       = regexp.MustCompile(`[.!?]['"”’]?$`)
+	// reNEOL matches a '.', '!' or '?', an optional closing quote, directly
+	// followed by a capitalised word (one upper-case letter plus lower-case
+	// letters) and an optional trailing period.
+	reNEOL      = regexp.MustCompile(`[.!?]['"”’]?\p{Lu}\p{Ll}+\.?`)
+	// reLET matches a single upper-case letter.
+	reLET       = regexp.MustCompile(`\p{Lu}`)
+	// reBody matches <br> tags in any case, with optional spaces and
+	// slashes before the closing '>'.
+	reBody      = regexp.MustCompile(`<[bB][rR][ /]*>`)
+	// reQuotLeft and reQuotRight match the opening and closing tags of
+	// <em> and <i> elements.
+	reQuotLeft  = regexp.MustCompile(`<em>|<i>`)
+	reQuotRight = regexp.MustCompile(`</em>|</i>`)
+)
+
+// HtmlFix rewrites raw HTML before parsing: opening <em>/<i> tags become
+// " „", closing ones become "” ", and <br> tags become a single space.
+func HtmlFix(html []byte) []byte {
+	opened := reQuotLeft.ReplaceAllLiteral(html, []byte(" „"))
+	closed := reQuotRight.ReplaceAllLiteral(opened, []byte("” "))
+	return reBody.ReplaceAllLiteral(closed, []byte(" "))
+}
+
+// HtmlFixString is the string counterpart of HtmlFix: opening <em>/<i>
+// tags become " „", closing ones become "” ", and <br> tags become a
+// single space.
+func HtmlFixString(html string) string {
+	opened := reQuotLeft.ReplaceAllLiteralString(html, " „")
+	closed := reQuotRight.ReplaceAllLiteralString(opened, "” ")
+	return reBody.ReplaceAllLiteralString(closed, " ")
+}
+
+// AddEnd trims s and returns it as one line ending in a newline. A period
+// is appended when the text does not already end in '.', '!' or '?'
+// (optionally followed by a closing quote). Blank input yields "".
+func AddEnd(s string) string {
+	trimmed := strings.TrimSpace(s)
+	switch {
+	case trimmed == "":
+		return ""
+	case reEOL.MatchString(trimmed):
+		return trimmed + "\n"
+	default:
+		return trimmed + ".\n"
+	}
+}
+
+// FixSpace collapses every run of whitespace in s into a single space and
+// trims both ends. When the first optional argument is true it also
+// inserts a space where sentence-ending punctuation is glued to a
+// following capitalised word.
+func FixSpace(s string, opt ...bool) string {
+	joined := strings.Join(strings.Fields(s), " ")
+	if len(opt) == 0 || !opt[0] {
+		return joined
+	}
+
+	return reNEOL.ReplaceAllStringFunc(joined, func(match string) string {
+		if strings.HasSuffix(match, ".") {
+			// such as: v.Chr.
+			return match
+		}
+		// Split just before the first upper-case letter of the match.
+		at := reLET.FindStringIndex(match)[0]
+		return match[:at] + " " + match[at:]
+	})
+}
+
+// MkLock creates filename as a symlink whose target is
+// "<base of filename>.<pid>" and then reads the link back to verify that
+// this process is the one holding it. Every failure is passed to p.
+func MkLock(filename string) {
+	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
+	p(os.Symlink(want, filename))
+
+	got, err := os.Readlink(filename)
+	p(err)
+
+	if got != want {
+		p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
+	}
+}
--- a/internal/week/week.go
+++ b/internal/week/week.go
@@ -0,0 +1,30 @@
+package week
+
+import (
+	"fmt"
+	"time"
+)
+
+// WeekDate returns the Monday (12:00 UTC) that starts ISO week `week` of
+// ISO year `year`. When the computed date does not fall in that year and
+// week, the date is returned together with an error.
+func WeekDate(year, week int) (time.Time, error) {
+	jan1 := time.Date(year, time.January, 1, 12, 0, 0, 0, time.UTC)
+
+	// Days from 1 January to the first Thursday of the year (0..6).
+	toThursday := (int(time.Thursday) - int(jan1.Weekday()) + 7) % 7
+
+	// The Monday before that Thursday starts week 1; every further week
+	// adds seven days.
+	monday := jan1.AddDate(0, 0, toThursday-3+7*(week-1))
+
+	if y, w := monday.ISOWeek(); y != year || w != week {
+		return monday, fmt.Errorf("week number out of range")
+	}
+	return monday, nil
+}
--- a/oud/fix.go
+++ b/oud/fix.go
@@ -0,0 +1,66 @@
+package main
+
+/*
+Dit past corpora aan
+
+Tags verwijderen:
+
+Oog: Nieuws
+Parool: Nieuws
+RO: Artikelen, cafeyn
+RTVNoord: br_*
+Tzum: Nieuws
+
+Tags veranderen:
+
+RTVNoord: tr_* → *
+
+*/
+
+import (
+	e "codeberg.org/pebbe/errors"
+	cc "github.com/pebbe/compactcorpus"
+	"github.com/rug-compling/alpinods"
+
+	"encoding/xml"
+	"fmt"
+	"os"
+	"strings"
+)
+
+var (
+	// x is the error handler used throughout this program.
+	// NOTE(review): presumably ExitErr ends the process on a non-nil
+	// error - confirm against codeberg.org/pebbe/errors.
+	x = e.ExitErr
+)
+
+// main rewrites every corpus file named on the command line into a sibling
+// "<base>-new.data.dz". For each document it removes the "tag" metadata
+// entries whose value is "Nieuws", "Artikelen", "cafeyn" or starts with
+// "br_", and strips the "tr_" prefix from the remaining tag values.
+func main() {
+	for _, file := range os.Args[1:] {
+		base := strings.TrimSuffix(file, ".data.dz")
+		newfile := base + "-new.data.dz"
+
+		incc, err := cc.Open(file)
+		x(err)
+		outcc, err := cc.NewCorpus(newfile)
+		x(err)
+		r, err := incc.NewRange()
+		x(err)
+		for r.HasNext() {
+			name, data := r.Next()
+			// Progress indicator, rewritten in place thanks to the trailing \r.
+			fmt.Printf("%s  %s           \r", base, name)
+			var alpino alpinods.AlpinoDS
+			x(xml.Unmarshal(data, &alpino))
+			for i := 0; i < len(alpino.Metadata.Meta); i++ {
+				if alpino.Metadata.Meta[i].Name != "tag" {
+					continue
+				}
+				if n := alpino.Metadata.Meta[i].Value; n == "Nieuws" || n == "Artikelen" || n == "cafeyn" || strings.HasPrefix(n, "br_") {
+					// Delete entry i in place and step back, so the element
+					// that moved into slot i is examined as well.
+					alpino.Metadata.Meta = append(alpino.Metadata.Meta[:i], alpino.Metadata.Meta[i+1:]...)
+					i--
+				} else if strings.HasPrefix(n, "tr_") {
+					alpino.Metadata.Meta[i].Value = n[3:]
+				}
+			}
+			// NOTE(review): any result of Write is discarded here, and incc is
+			// never closed - check the compactcorpus API for an error return
+			// and a Close method.
+			outcc.Write(name, []byte(alpino.String()))
+		}
+		x(outcc.Close())
+	}
+}
--- a/oud/rang/collect-rang.sh
+++ b/oud/rang/collect-rang.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+set -e
+
+unset CDPATH
+PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+export TZ=Europe/Amsterdam
+
+# An optional leading -v switches on progress messages.
+verbose=0
+if [ "$1" = "-v" ]
+then
+    shift
+    verbose=1
+fi
+
+# say MESSAGE...
+# Print the arguments on stdout, but only in verbose mode.
+# The explicit if/fi makes the function return 0 when it stays silent,
+# which matters because the script runs under `set -e`.
+say () {
+    if [ "$verbose" = "1" ]
+    then
+        echo "$*"
+    fi
+}
+
+# Select the ISO week to process, written as YYYY.WW.
+# Without an argument: the week that contained the day seven days ago
+# (%G = ISO year, %V = ISO week number). With an argument: it must look
+# like 2NNN.WW, otherwise the script prints INVALID and exits with status 1.
+# NOTE(review): the pattern [0-5][0-9] also admits 00 and 54-59, which are
+# not valid ISO week numbers.
+if [ "$1" = "" ]
+then
+    ds=`date -d -7days +%G.%V`
+else
+    case "$1" in
+        2[0-9][0-9][0-9].[0-5][0-9])
+            ds=$1
+            ;;
+        *)
+            echo INVALID
+            exit 1
+            ;;
+    esac
+fi
+
+# The year is the part of ds before the dot.
+year=${ds%%.*}
+
+# Make sure the per-year output directories exist and work inside the data
+# directory, so the relative output file names below end up there.
+# The json directory is only created here; nothing else in this script uses it.
+mkdir -p /net/corpora/nlnieuws/data/$year
+mkdir -p /net/corpora/nlnieuws/data/json/$year
+cd /net/corpora/nlnieuws/data/$year
+
+declare -A parts
+#parts[alles]='.'
+parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
+#parts[amsterdam]='AT5|BuurtAdam|Parool'
+#parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
+#parts[literatuur]='LitNL|Tzum'
+#parts[vlaanderen]='HLN|VRT'
+#parts[AT5]='AT5'
+#parts[BuurtAdam]='BuurtAdam'
+#parts[BuurtGrn]='BuurtGrn'
+#parts[GG]='GG'
+#parts[HLN]='HLN'
+#parts[LitNL]='LitNL'
+#parts[NOS]='NOS'
+#parts[NU]='NU'
+#parts[NieuwsNL]='NieuwsNL'
+#parts[Oog]='Oog'
+#parts[Parool]='Parool'
+#parts[RO]='RO'
+#parts[RTVNoord]='RTVNoord'
+#parts[Sargasso]='Sargasso'
+#parts[Sikkom]='Sikkom'
+#parts[Tzum]='Tzum'
+#parts[Volkskrant]='Volkskrant'
+#parts[VRT]='VRT'
+
+for part in ${!parts[@]}
+do
+    regex=${parts[$part]}
+
+    for i in 1 4
+    do
+        files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
+        if [ -z "$files" ]
+        then
+            continue
+        fi
+
+        say $part-rang-word-$ds-$i
+        alto \
+            'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
+            > $part-rang-word-$ds-$i
+
+        say $part-rang-loc-$ds-$i
+        alto \
+            'fp://node[(@neclass="LOC"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="LOC" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
+            > $part-rang-loc-$ds-$i
+
+        say $part-rang-per-$ds-$i
+        alto \
+            'fp://node[(@neclass="PER"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="PER" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
+            > $part-rang-per-$ds-$i
+
+        say $part-rang-org-$ds-$i
+        alto \
+            'fp://node[(@neclass="ORG"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="ORG" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
+            > $part-rang-org-$ds-$i
+
+        say $part-rang-misc-$ds-$i
+        alto \
+            'fp://node[(@neclass="MISC"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="MISC" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
+            > $part-rang-misc-$ds-$i
+
+    done
+done
--- a/oud/xquery/README
+++ b/oud/xquery/README
--- a/oud/xquery/collect.sh.oud
+++ b/oud/xquery/collect.sh.oud
--- a/oud/xquery/items2count.go.oud
+++ b/oud/xquery/items2count.go.oud
--- a/oud/xquery/locaties.xq
+++ b/oud/xquery/locaties.xq
--- a/oud/xquery/new2old.go
+++ b/oud/xquery/new2old.go
--- a/oud/xquery/nieuwe_namen.xq
+++ b/oud/xquery/nieuwe_namen.xq
--- a/oud/xquery/nieuwe_woorden.xq
+++ b/oud/xquery/nieuwe_woorden.xq
--- a/oud/xquery/organisaties.xq
+++ b/oud/xquery/organisaties.xq
--- a/oud/xquery/overige_namen.xq
+++ b/oud/xquery/overige_namen.xq
--- a/oud/xquery/personen.xq
+++ b/oud/xquery/personen.xq
--- a/python/Claude.md
+++ b/python/Claude.md
@@ -0,0 +1,270 @@
+---
+title: Claude
+updated: 2026-06-18 13:20:25Z
+created: 2026-06-18 13:15:32Z
+source: https://claude.ai/share/55700eff-d389-48d5-b1f5-cff59059ce61
+---
+
+> I have collected a dataset of news articles from the past few weeks. I would like to know which words occur significantly more or less frequently in the last week compared to the four weeks before. What are good methods to do this?
+
+There are several good methods for detecting statistically significant shifts in word frequency between two time periods. Here's an overview:
+
+### Corpus Comparison Methods
+
+**1\. Log-Likelihood Ratio (G²)**  
+The most widely used method in corpus linguistics (Dunning, 1993). It compares observed vs. expected frequencies under the null hypothesis that both corpora are drawn from the same distribution. It handles low-frequency words better than chi-squared and works well even when the corpora are very different sizes — which matters here since you're comparing 1 week vs. 4 weeks.
+
+**2\. Chi-Squared (χ²) Test**  
+A simpler alternative, but it's unreliable for low-frequency words (expected count < 5) and can produce many false positives in sparse data. Log-likelihood is generally preferred over it for text.
+
+**3\. Mann-Whitney U / Permutation Tests**  
+Non-parametric tests that make no distributional assumptions. Useful if you want to compare word rates *per article* rather than pooled counts, since they respect article-level variance.
+
+### Effect Size Measures (use alongside significance)
+
+Significance alone doesn't tell you how *much* a word has changed. Pair it with:
+
+- **%DIFF** — simple percentage difference in relative frequency, easy to interpret
+- **Bayes Factor (BIC approximation)** — penalises complexity, good for prioritising the most meaningful differences
+- **Log Ratio** — `log2(freq_week5 / freq_weeks1-4)`, gives a symmetric, interpretable effect size: +1 means doubled, −1 means halved
+
+### Practical Considerations
+
+| Issue | Recommendation |
+| --- | --- |
+| Unequal corpus sizes | Normalize to relative frequencies (per million words) before comparing |
+| Multiple comparisons | Apply Benjamini-Hochberg FDR correction — you're testing thousands of words simultaneously |
+| Low-frequency words | Set a minimum frequency threshold (e.g., appears ≥ 5 times total) to reduce noise |
+| Stop words | Decide whether to include them — they shift too and can be informative (e.g., rising use of "we" vs. "they") |
+| Multiword expressions | Consider bigrams/trigrams if single words feel too decontextualised |
+
+### Recommended Pipeline
+
+1.  **Tokenize and normalize** (lowercase, optional lemmatization)
+2.  **Count frequencies** in each time window
+3.  **Filter** by minimum frequency
+4.  **Run Log-Likelihood** for significance; compute **Log Ratio** for effect size
+5.  **Apply FDR correction** across all words
+6.  **Rank** by effect size among significant words
+
+### Tools & Libraries
+
+- **Python**: `scipy.stats` for chi-squared/G², `statsmodels` for FDR correction, `nltk`/`spacy` for tokenization
+- **R**: the `quanteda` package has `textstat_keyness()` built-in, which implements log-likelihood and several other keyness measures out of the box — very convenient for exactly this use case
+- **Ready-made**: AntConc (free GUI tool) supports keyness analysis directly
+
+Would you like help implementing any of these in code?
+
+> For the %DIFF, do you calculate it as "freq_week5 - freq_weeks1-4" ?
+>
+> Can you show scipy.stats code for chi-squared and G^2 ?
+
+### %DIFF Formula
+
+It's the relative difference in *normalised* frequencies (not raw counts, since your corpora are different sizes):
+
+```
+%DIFF = ((freq_B / size_B) - (freq_A / size_A)) / (freq_A / size_A) * 100
+```
+
+Where A is the reference corpus (weeks 1–4) and B is the target (week 5). So it answers: "by what percentage did the relative frequency go up or down?"
+
+* * *
+
+### Chi-Squared and G² in scipy
+
+```python
+import numpy as np
+from scipy.stats import chi2_contingency, chi2
+
+def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
+    """
+    word             : the word being tested
+    counts_recent    : raw count in week 5
+    counts_reference : raw count in weeks 1-4
+    total_recent     : total tokens in week 5
+    total_reference  : total tokens in weeks 1-4
+    """
+    a = counts_recent      # word in recent
+    b = counts_reference   # word in reference
+    c = total_recent - a   # non-word in recent
+    d = total_reference - b  # non-word in reference
+
+
+    contingency = np.array([[a, b],
+                            [c, d]])
+
+
+    # --- Chi-Squared ---
+    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
+
+
+    # --- Log-Likelihood (G²) ---
+    # G² = 2 * sum(observed * log(observed / expected))
+    # scipy's chi2_contingency with lambda_="log-likelihood" computes this
+    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
+
+
+    # --- Effect sizes ---
+    freq_recent    = a / total_recent
+    freq_reference = b / total_reference
+
+    pct_diff = (freq_recent - freq_reference) / freq_reference * 100
+
+
+    # Avoid log(0) with a small epsilon
+    eps = 1e-9
+    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
+
+
+    return {
+        "word":          word,
+        "freq_recent":   freq_recent,
+        "freq_reference": freq_reference,
+        "pct_diff":      pct_diff,
+        "log_ratio":     log_ratio,
+        "chi2":          chi2_stat,
+        "p_chi2":        p_chi2,
+        "g2":            g2_stat,
+        "p_g2":          p_g2,
+    }
+```
+
+A few notes:
+
+- `correction=False` disables Yates' continuity correction for chi-squared, which is standard for large corpora
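+- `lambda_="log-likelihood"` is scipy's built-in way to get G²; it uses the same contingency table logic. Note that for a 2×2 table `chi2_contingency` applies Yates' correction by default regardless of `lambda_`, so pass `correction=False` to this call as well if you want the uncorrected G² given by the formula in the code comment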
+- Both tests have **1 degree of freedom** for a 2×2 table, so the p-values are directly comparable
+
+* * *
+
+### Running it over a full vocabulary
+
+```python
+from statsmodels.stats.multitest import multipletests
+
+results = [
+    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
+                 total_recent, total_reference)
+    for word in counts_recent
+]
+
+
+# FDR correction across all words
+
+p_values = [r["p_g2"] for r in results]
+
+_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
+
+
+for r, p_adj in zip(results, p_adjusted):
+    r["p_g2_adjusted"] = p_adj
+```
+
+Would you like help loading your articles and building the frequency counts to feed into this?
+
+> And what about tf-idf?
+
+TF-IDF is a slightly different tool — worth understanding the distinction before deciding whether to use it.
+
+### What TF-IDF actually measures
+
+TF-IDF scores a word relative to how *distinctive* it is within a document compared to a collection. The formula is:
+
+```
+TF-IDF(word, doc) = TF(word, doc) × IDF(word, corpus)
+```
+
+- **TF** — how often the word appears in a specific document
+- **IDF** — penalises words that appear in many documents (so "the" scores low)
+
+It was designed for **ranking documents by relevance to a query**, not for comparing two time periods.
+
+### The core problem for your use case
+
+If you treat each time period as one big document, TF-IDF collapses to just IDF — because TF becomes the same for both "documents" and cancels out. You'd just be finding words that are rare across the whole corpus, which isn't what you want.
+
+You *could* treat each article as a document and then aggregate scores by time period, but then you're measuring **within-article distinctiveness**, not **frequency shifts over time**.
+
+### When TF-IDF *is* useful in your pipeline
+
+| Use case | Good fit? |
+| --- | --- |
+| Detecting words that increased/decreased in frequency week-over-week | ✗ Use G² + log ratio instead |
+| Finding the most topically distinctive words *within* a single article | ✓   |
+| Building features for a classifier or clustering articles by topic | ✓   |
+| Summarising what a time period was "about" relative to a background corpus | Partially — see below |
+
+### The partial exception: keyness as pseudo-TF-IDF
+
+Some researchers compute a **keyness score** by treating the recent period as the "document" and the reference period as the "corpus" — which is conceptually similar to TF-IDF. But the statistically rigorous version of this is exactly the log-likelihood + log ratio approach already described. G² is essentially the principled, significance-tested version of what TF-IDF approximates intuitively.
+
+### Bottom line
+
+For your goal — words that shifted significantly between week 5 and weeks 1–4 — G² with log ratio is the right tool. TF-IDF would give you a rough proxy at best and a misleading ranking at worst.
+
+> The corpus_stats() function contains the following comment:
+>
+> `# G² = 2 * sum(observed * log(observed / expected))`
+>
+> Is this the formula to calculate the log-likelihood ratio? What do the observed and expected variables contain?
+
+Yes, that's the correct formula for G². Let me break it down.
+
+### The full formula
+
+```
+G² = 2 × Σ Oᵢ × ln(Oᵢ / Eᵢ)
+```
+
+The sum runs over all **four cells** of the contingency table:
+
+|     | Recent (week 5) | Reference (weeks 1–4) |
+| --- | --- | --- |
+| **Target word** | a   | b   |
+| **All other words** | c   | d   |
+
+### Observed values
+
+Simply the raw counts from your data:
+
+```
+O₁ = a  (word in recent)
+O₂ = b  (word in reference)
+O₃ = c  (all other words in recent)
+O₄ = d  (all other words in reference)
+```
+
+### Expected values
+
+The counts you would *expect* if the word's relative frequency were identical in both corpora — i.e. under the null hypothesis that nothing changed. They're computed from the row and column marginals:
+
+```
+E₁ = (a + b) × (a + c) / N
+E₂ = (a + b) × (b + d) / N
+E₃ = (c + d) × (a + c) / N
+E₄ = (c + d) × (b + d) / N
+```
+
+Where `N = a + b + c + d` is the total number of tokens across both corpora.
+
+### Concrete example
+
+Say your recent corpus has 10,000 tokens and your reference has 40,000 tokens, and the word "crisis" appears 30 times recently and 60 times in the reference:
+
+```
+a = 30,    b = 60
+c = 9970,  d = 39940
+N = 50000
+
+E₁ = (30+60)  × (30+9970)  / 50000 = 90 × 10000 / 50000 = 18
+E₂ = (30+60)  × (60+39940) / 50000 = 90 × 40000 / 50000 = 72
+E₃ = (9970+39940) × 10000  / 50000 = 49910 × 10000 / 50000 = 9982
+E₄ = (9970+39940) × 40000  / 50000 = 49910 × 40000 / 50000 = 39928
+
+G² = 2 × (30×ln(30/18) + 60×ln(60/72) + 9970×ln(9970/9982) + 39940×ln(39940/39928))
+   ≈ 2 × (15.3 − 10.9 − 12.0 + 12.0)
+   ≈ 8.8
+```
+
+"crisis" appears at about 1.67× its expected count (30/18), which is twice its relative frequency in the reference corpus (0.30% vs. 0.15%), and G² ≈ 8.8 exceeds the chi-squared critical value of 3.84 at p < 0.05 with 1 degree of freedom, so the increase is significant.
--- a/python/Claude.pdf
+++ b/python/Claude.pdf
--- a/python/data.txt
+++ b/python/data.txt
--- a/python/namen.html
+++ b/python/namen.html
--- a/python/namen.ipynb
+++ b/python/namen.ipynb
--- a/python/namen.py
+++ b/python/namen.py
@@ -0,0 +1,82 @@
+#!/net/corpora/nlnieuws/notebook/bin/python3
+
+import numpy as np
+from scipy.stats import chi2_contingency
+from statsmodels.stats.multitest import multipletests
+import pandas as pd
+
+# waarom werkt dit niet?
+pd.set_option('display.max_rows', 40)
+
+def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
+    """
+    word             : the word being tested
+    counts_recent    : raw count in week 5
+    counts_reference : raw count in weeks 1-4
+    total_recent     : total tokens in week 5
+    total_reference  : total tokens in weeks 1-4
+    """
+    a = counts_recent      # word in recent
+    b = counts_reference   # word in reference
+    c = total_recent - a   # non-word in recent
+    d = total_reference - b  # non-word in reference
+
+    contingency = np.array([[a, b],
+                             [c, d]])
+
+    # --- Chi-Squared ---
+    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
+
+    # --- Log-Likelihood (G²) ---
+    # G² = 2 * sum(observed * log(observed / expected))
+    # scipy's chi2_contingency with lambda_="log-likelihood" computes this
+    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
+
+    # --- Effect sizes ---
+    freq_recent    = a / total_recent
+    freq_reference = b / total_reference
+
+    pct_diff = (freq_recent - freq_reference) / freq_reference * 100
+
+    # Avoid log(0) with a small epsilon
+    eps = 1e-9
+    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
+
+    return {
+        "word":           word,
+        "freq_recent":    freq_recent,
+        "freq_reference": freq_reference,
+        "pct_diff":       pct_diff,
+        "log_ratio":      log_ratio,
+        "chi2":           chi2_stat,
+        "p_chi2":         p_chi2,
+        "g2":             g2_stat,
+        "p_g2":           p_g2,
+    }
+
+counts_recent = {}
+counts_reference = {}
+with open("data.txt", "rt", encoding="utf-8") as fp:
+    for line in fp:
+        aa = line.split("\t")
+        counts_reference[aa[0]] = max(int(aa[1]), 0.5)
+        counts_recent[aa[0]] = max(int(aa[2]), 0.5)
+total_recent = sum(counts_recent.values())
+total_reference = sum(counts_reference.values())
+
+results = [
+    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
+                 total_recent, total_reference)
+    for word in counts_recent]
+
+# FDR correction across all words
+p_values = [r["p_g2"] for r in results]
+_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
+
+for r, p_adj in zip(results, p_adjusted):
+    r["p_g2_adjusted"] = p_adj
+
+results = pd.DataFrame(results)
+print(results)
+print(results.sort_values('g2'))
+print(results.sort_values('pct_diff'))
--- a/python/score.txt
+++ b/python/score.txt
@@ -0,0 +1,30 @@
+Er zijn twee simpele formules om de "effect size" van het verschil tussen twee
+relatieve frequenties te rapporteren:
+ *  %DIFF = (freq_B  - freq_A) / freq_A * 100
+    Percentage verschil in relatieve frequenties, makkelijk te interpreteren,
+    maar niet symmetrisch.
+ *  Log Ratio: log2(freq_B / freq_A)
+    Een symmetrische en interpreteerbare effect size; +1 is een verdubbeling, -1
+    een halvering
+Twee populaire methodes om de significantie van frequentieverschillen te testen
+(ook wel keyword extraction):
+ *  Log-Likelihood Ratio (G^2): meest gebruikte methode in Corpus Linguistics.
+    Vergelijkt observed vs expected frequency.
+ *  Chi-Squared test (X^2): simpeler dan G^2, maar geeft meer false positives
+    bij sparse data, werkt niet goed met lage frequenties.
+Je kunt dan de gebruiker alleen de woorden met significante verschillen laten
+zien (dit zijn dan de keywords). Ik heb met behulp van Claude een notebook in
+elkaar gezet met een demonstratie van deze methodes:
+→ Word freq comparison.ipynb
+
+
+Er zijn ook geavanceerdere methodes die me te ingewikkeld lijken om te
+implementeren, maar ik noem ze voor de volledigheid. In de stylometrie
+is Burrows' Zeta populair, deze is bijv. beschikbaar in Stylo
+https://github.com/computationalstylistics/stylo onder de oppose()
+functie
+https://cran.r-project.org/web/packages/stylo/stylo.pdf#Rfn.oppose.1 .
+Er is ook een methode die gebruik maakt van Bayesiaanse statistiek en
+frequenties uit een achtergrondcorpus, de Fightin' Words methode van
+Monroe et al: https://github.com/jmhessel/FightingWords
+
--- a/python/stijgers.py
+++ b/python/stijgers.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+import sys
+import numpy as np
+from scipy.stats import chi2_contingency
+from statsmodels.stats.multitest import multipletests
+import pandas as pd
+
+# why doesn't this work?
+# NOTE(review): this script writes its tables with DataFrame.to_string() on
+# explicit [:40] slices; presumably the display options below have no effect
+# on that output - confirm against the pandas documentation.
+pd.set_option('display.max_rows', 40)
+pd.set_option('display.max_columns', None)
+
+def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
+    """
+    word             : the word being tested
+    counts_recent    : raw count in week 5
+    counts_reference : raw count in weeks 1-4
+    total_recent     : total tokens in week 5
+    total_reference  : total tokens in weeks 1-4
+    """
+    a = counts_recent      # word in recent
+    b = counts_reference   # word in reference
+    c = total_recent - a   # non-word in recent
+    d = total_reference - b  # non-word in reference
+
+    contingency = np.array([[a, b],
+                             [c, d]])
+
+    # --- Chi-Squared ---
+    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
+
+    # --- Log-Likelihood (G²) ---
+    # G² = 2 * sum(observed * log(observed / expected))
+    # scipy's chi2_contingency with lambda_="log-likelihood" computes this
+    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
+
+    # --- Effect sizes ---
+    freq_recent    = a / total_recent
+    freq_reference = b / total_reference
+
+    pct_diff = (freq_recent - freq_reference) / freq_reference * 100
+
+    # Avoid log(0) with a small epsilon
+    eps = 1e-9
+    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
+
+    return {
+        "word":           word,
+        "freq_recent":    freq_recent,
+        "freq_reference": freq_reference,
+        "pct_diff":       pct_diff,
+        "log_ratio":      log_ratio,
+        "chi2":           chi2_stat,
+        "p_chi2":         p_chi2,
+        "g2":             g2_stat,
+        "p_g2":           p_g2,
+    }
+
+oudfile = sys.argv[1]
+nieuwfile = sys.argv[2]
+textfile = sys.argv[3]
+
+counts_recent = {}
+counts_reference = {}
+
+with open(oudfile, "rt", encoding="utf-8") as fp:
+    for line in fp:
+        aa = line.split("\t")
+        counts_reference[aa[1].strip()] = int(aa[0])
+with open(nieuwfile, "rt", encoding="utf-8") as fp:
+    for line in fp:
+        aa = line.split("\t")
+        counts_recent[aa[1].strip()] = int(aa[0])
+
+for key in counts_recent:
+    if not key in counts_reference:
+        counts_reference[key] = 0.5
+for key in counts_reference:
+    if not key in counts_recent:
+        counts_recent[key] = 0.5
+
+total_recent = sum(counts_recent.values())
+total_reference = sum(counts_reference.values())
+
+results = [
+    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
+                 total_recent, total_reference)
+    for word in counts_recent]
+
+# FDR correction across all words
+p_values = [r["p_g2"] for r in results]
+_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
+
+for r, p_adj in zip(results, p_adjusted):
+    r["p_g2_adjusted"] = p_adj
+
+results = pd.DataFrame(results)
+
+with open(textfile, "wt", encoding="utf-8") as fp:
+    fp.write("STIJGERS\n")
+    fp.write(results[results.pct_diff > 0].sort_values('g2', ascending=False)[:40].to_string())
+    fp.write("\n\nDALERS\n")
+    fp.write(results[results.pct_diff < 0].sort_values('g2', ascending=False)[:40].to_string())
+    fp.write("\n")
--- a/python/stijgers2json.py
+++ b/python/stijgers2json.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+import json
+import sys
+import numpy as np
+from scipy.stats import chi2_contingency
+from statsmodels.stats.multitest import multipletests
+
+def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
+    """
+    word             : the word being tested
+    counts_recent    : raw count in week 5
+    counts_reference : raw count in weeks 1-4
+    total_recent     : total tokens in week 5
+    total_reference  : total tokens in weeks 1-4
+    """
+    a = counts_recent      # word in recent
+    b = counts_reference   # word in reference
+    c = total_recent - a   # non-word in recent
+    d = total_reference - b  # non-word in reference
+
+    contingency = np.array([[a, b],
+                             [c, d]])
+
+    # --- Chi-Squared ---
+    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
+
+    # --- Log-Likelihood (G²) ---
+    # G² = 2 * sum(observed * log(observed / expected))
+    # scipy's chi2_contingency with lambda_="log-likelihood" computes this
+    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
+
+    # --- Effect sizes ---
+    freq_recent    = a / total_recent
+    freq_reference = b / total_reference
+
+    pct_diff = (freq_recent - freq_reference) / freq_reference * 100
+
+    # Avoid log(0) with a small epsilon
+    eps = 1e-9
+    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
+
+    return {
+        "word":           word,
+        "freq_recent":    freq_recent,
+        "freq_reference": freq_reference,
+        "pct_diff":       pct_diff,
+        "log_ratio":      log_ratio,
+        "chi2":           chi2_stat,
+        "p_chi2":         p_chi2,
+        "g2":             g2_stat,
+        "p_g2":           p_g2,
+    }
+
+oudfile = sys.argv[1]
+nieuwfile = sys.argv[2]
+jsonfile = sys.argv[3]
+
+counts_recent = {}
+counts_reference = {}
+
+with open(oudfile, "rt", encoding="utf-8") as fp:
+    for line in fp:
+        aa = line.split("\t")
+        counts_reference[aa[1].strip()] = int(aa[0])
+with open(nieuwfile, "rt", encoding="utf-8") as fp:
+    for line in fp:
+        aa = line.split("\t")
+        counts_recent[aa[1].strip()] = int(aa[0])
+
+for key in counts_recent:
+    if not key in counts_reference:
+        counts_reference[key] = 0.5
+for key in counts_reference:
+    if not key in counts_recent:
+        counts_recent[key] = 0.5
+
+total_recent = sum(counts_recent.values())
+total_reference = sum(counts_reference.values())
+
+results = [
+    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
+                 total_recent, total_reference)
+    for word in counts_recent]
+
+# FDR correction across all words
+p_values = [r["p_g2"] for r in results]
+_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
+
+for r, p_adj in zip(results, p_adjusted):
+    r["p_g2_adjusted"] = p_adj
+
+o = {}
+#o['up'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
+#o['dn'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]
+o['up'] = sorted([x for x in results if x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
+o['dn'] = sorted([x for x in results if x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]
+with open(jsonfile, "wt", encoding="utf-8") as fp:
+    json.dump(o, fp)
--- a/python/word_freq_comparison.html
+++ b/python/word_freq_comparison.html
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Peter Kleiweg	c2389c65af	stijgers2json.py	2026-06-19 17:53:03 +02:00
Peter Kleiweg	802a84a929	collect.sh: stijgers	2026-06-19 13:41:28 +02:00
Peter Kleiweg	8289072a52	weekadd	2026-06-19 12:59:45 +02:00
Peter Kleiweg	e0550197e3	meer stijgers	2026-06-18 14:49:00 +02:00
Peter Kleiweg	01e6d48665	stijgers	2026-06-18 12:52:40 +02:00
Peter Kleiweg	a8bea0ab44	updates	2026-06-10 13:31:55 +02:00
Peter Kleiweg	d7adc17d4b	Volkskrant: columns hebben geen intro	2026-06-09 18:37:19 +02:00
Peter Kleiweg	a9f9e17acf	gone gone	2026-06-06 21:05:00 +02:00
Peter Kleiweg	1f4a084624	gone, trends	2026-06-06 17:10:38 +02:00
Peter Kleiweg	9f29222909	ranglijsten	2026-06-06 14:46:01 +02:00
Peter Kleiweg	a76fa21584	update	2026-06-05 16:05:46 +02:00
Peter Kleiweg	efa301cc4a	Tzum: <em>Titel</em> -> "Titel"	2026-06-03 17:32:01 +02:00
Peter Kleiweg	14590570ba	fix voor zinnen aan elkaar	2026-05-29 17:22:10 +02:00
Peter Kleiweg	ca4e7af8fa	tags; .De -> . De	2026-05-29 12:22:57 +02:00
Peter Kleiweg	66581d4e98	data2json.go: aangepast aan grote reorganisatie	2026-05-28 02:59:46 +02:00
Peter Kleiweg	e53049e62f	top20.go: aangepast aan grote reorganisatie	2026-05-28 02:36:55 +02:00
Peter Kleiweg	7f23212fc3	app: checkmark bij datumkiezer	2026-05-27 23:19:30 +02:00
Peter Kleiweg	5c651387af	grote reorganisatie: - HLN, NOS, NU, VRT: per week -> per dag - yyyy-ww -> yyyy.ww - yyyy* -> yyyy/yyyy* etc	2026-05-27 22:42:03 +02:00
Peter Kleiweg	e430ff576b	update readme	2026-05-25 12:37:55 +02:00
Peter Kleiweg	9d82f11536	README	2026-05-24 19:13:09 +02:00
Peter Kleiweg	650a13eb4a	weg: cmd/score (2)	2026-05-24 16:44:26 +02:00
Peter Kleiweg	bf0407b933	weg: cmd/score	2026-05-24 16:42:12 +02:00
Peter Kleiweg	fcad105a75	helperfuncties naar internal/util (2)	2026-05-24 16:29:56 +02:00
Peter Kleiweg	75832c3132	fix title voor HLN en Parool; helperfuncties naar internal/util	2026-05-24 16:16:21 +02:00