diff --git a/.gitignore b/.gitignore
index 969f0cf..07f4f15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@ AT5/metadata
AT5/xml2txt
GG/gg
GG/metadata
+LitNL/litnl
+LitNL/metadata
+LitNL/xml2txt
NieuwsNL/metadata
NieuwsNL/nieuwsnl
NOS/json2txt
diff --git a/LitNL/Makefile b/LitNL/Makefile
new file mode 100644
index 0000000..9a6b040
--- /dev/null
+++ b/LitNL/Makefile
@@ -0,0 +1,13 @@
+all: \
+ xml2txt \
+ metadata \
+ litnl # default target: build all three command binaries
+
+xml2txt: cmd/xml2txt/*.go # rebuilt whenever one of the command's Go sources is newer than the binary
+ go build -o $@ $^
+
+metadata: cmd/metadata/*.go # same rule shape as xml2txt
+ go build -o $@ $^
+
+litnl: cmd/litnl/*.go # same rule shape as xml2txt
+ go build -o $@ $^
diff --git a/LitNL/cmd/litnl/litnl.go b/LitNL/cmd/litnl/litnl.go
new file mode 100644
index 0000000..56aa5b2
--- /dev/null
+++ b/LitNL/cmd/litnl/litnl.go
@@ -0,0 +1,123 @@
+package main
+
+import (
+ e "codeberg.org/pebbe/errors"
+
+ "encoding/xml"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "os"
+ "path/filepath"
+ "strings"
+ "time"
+)
+
+type Rss struct { // Rss is the decoded feed document; only the items of the channel are kept.
+ XMLName xml.Name `xml:"rss"` // the root element has to be <rss>
+ Items []ItemT `xml:"channel>item"` // every <item> nested under <channel>
+}
+
+type ItemT struct { // ItemT holds the parts of one feed <item> that this program works with.
+ PubDate string `xml:"pubDate"` // publication date text; main parses it as RFC 1123 (with numeric or named zone)
+ UnixTime int64 `xml:"unixTime"` // NOTE(review): not read anywhere in the visible code — confirm whether it is still needed
+ Guid string `xml:"guid"` // item identifier; main derives the output file name from it
+ Link string `xml:"link"` // item URL; only printed in diagnostics
+ Data []byte `xml:",innerxml"` // raw, unparsed XML found inside <item>; written verbatim to the output file
+}
+
+var (
+ p = e.PanicErr // shorthand for the helper from codeberg.org/pebbe/errors; NOTE(review): presumably panics on a non-nil error and sets e.Panicked — confirm
+ agent = "AhrefsBot/7.0" // value sent as the User-Agent header of the feed request
+)
+
+// exists reports whether a directory entry named filename is present.
+//
+// It uses os.Lstat rather than os.Stat, so a symbolic link counts as present
+// even when its target is missing. The lock entries this program looks for
+// are exactly such dangling links ("lock" -> "lock.<pid>", see mkLock), and
+// os.Stat would follow the link and report them as absent.
+//
+// Any Lstat error, not only "does not exist", yields false.
+func exists(filename string) bool {
+	_, err := os.Lstat(filename)
+	return err == nil
+}
+
+func main() { // main downloads the Literair Nederland RSS feed and stores every item as an .xml file in a directory per ISO year and week.
+ defer func() {
+ if e.Panicked { // flag from the errors package; NOTE(review): presumably set when p() panicked — confirm
+ _ = recover() // swallow the panic; the process then ends through os.Exit below
+ os.Exit(1)
+ }
+ }()
+
+ myLock := "/net/corpora/nlnieuws/LitNL/lock"
+ mkLock(myLock) // called before the removal below is registered, so a failed attempt does not delete someone else's lock
+ defer func() {
+ _ = os.Remove(myLock) // best effort; deferred last, so it runs before the handler above calls os.Exit
+ }()
+
+ req, err := http.NewRequest("GET", "https://www.literairnederland.nl/feed/", nil)
+ p(err)
+ req.Header.Set("User-Agent", agent)
+
+ client := &http.Client{} // NOTE(review): no Timeout is set, so a stalled server blocks the run while the lock is held
+ resp, err := client.Do(req)
+ p(err)
+ body, err := io.ReadAll(resp.Body) // NOTE(review): resp.StatusCode is never checked; an error page is only caught by the XML decoding below
+ p(err)
+ p(resp.Body.Close())
+
+ var rss Rss
+ p(xml.Unmarshal(body, &rss))
+
+ if len(rss.Items) == 0 { // a feed without items is treated as a failure
+ p(fmt.Errorf("len(rss.Items) == 0"))
+ }
+
+ for _, item := range rss.Items {
+ t, err := time.Parse(time.RFC1123Z, item.PubDate) // try the numeric-zone layout first ...
+ if err != nil {
+ t, err = time.Parse(time.RFC1123, item.PubDate) // ... then the layout with a zone abbreviation
+ }
+ p(err)
+ year, week := t.ISOWeek()
+ dirname := fmt.Sprintf("/net/corpora/nlnieuws/LitNL/%d/%02d", year, week)
+ if exists(dirname + "/lock") { // skip items whose week directory has a lock entry (txt2corpus.sh creates one there while it works)
+ continue
+ }
+ basename := strings.TrimPrefix(item.Guid, "https://www.literairnederland.nl/?p=") // leaves only the post id when the guid has this form
+ filename := dirname + "/" + url.PathEscape(basename) // escaping keeps an unexpected guid from adding path separators
+
+ p(os.MkdirAll(dirname, 0777))
+ func() { // closure so that the deferred cleanup runs once per item
+ var ok bool
+ defer func() {
+ if e.Panicked {
+ fmt.Fprintln(os.Stderr, "----", filename) // name the item that was being written when things failed
+ fmt.Fprintln(os.Stderr, "----", item.Link)
+ }
+ if !ok {
+ _ = os.Remove(filename + ".xml") // do not leave a partial file behind
+ }
+ }()
+ fp, err := os.Create(filename + ".xml") // an existing file for the same item is truncated and written again
+ p(err)
+ p(fp.WriteString("\n- \n")) // NOTE(review): this literal and the one three lines below look damaged in this copy (markup apparently stripped, one literal split over two lines) — restore from the repository
+ p(fmt.Fprintf(fp, "%d", t.Unix())) // publication time as Unix seconds
+ p(fp.Write(item.Data))
+ p(fp.WriteString("
\n"))
+ p(fp.Close())
+ p(os.Chtimes(filename+".xml", t, t)) // access and modification time are set to the publication time
+ ok = true
+ }()
+ }
+
+}
+
+// mkLock takes the lock at filename by creating it as a symbolic link whose
+// target is "<base>.<pid>": the last path element of filename followed by the
+// id of the current process. os.Symlink fails when filename already exists,
+// so a lock held by another process is reported through p. As a safeguard the
+// link is read back and its target compared with the expected one.
+func mkLock(filename string) {
+	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
+	p(os.Symlink(target, filename))
+
+	got, err := os.Readlink(filename)
+	p(err)
+	if got == target {
+		return
+	}
+	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
+}
diff --git a/LitNL/cmd/metadata/metadata.go b/LitNL/cmd/metadata/metadata.go
new file mode 100644
index 0000000..95e3d9d
--- /dev/null
+++ b/LitNL/cmd/metadata/metadata.go
@@ -0,0 +1,131 @@
+package main
+
+import (
+ e "codeberg.org/pebbe/errors"
+
+ "bufio"
+ "encoding/xml"
+ "fmt"
+ "html"
+ "os"
+ "strings"
+ "time"
+)
+
+type Item struct { // Item decodes only the timestamp of a stored feed item file.
+ XMLName xml.Name `xml:"item"` // the root element has to be <item>
+ UnixTime int64 `xml:"unixTime"` // Unix seconds read from <unixTime>; doXml turns it into a calendar date
+}
+
+var (
+ x = e.ExitErr // shorthand for the helper from codeberg.org/pebbe/errors; NOTE(review): presumably ends the program on a non-nil error — confirm
+ escape = html.EscapeString
+ data = make(map[string][]string) // metadata strings collected by doText and doXml, keyed by file name without its extension
+ location *time.Location // set in main to Europe/Amsterdam; doXml uses it to derive calendar dates
+)
+
+func main() { // main collects metadata from the .txt and .xml files in "." and "..", then rewrites every .xml file in ./xml with that metadata added.
+ var err error
+ location, err = time.LoadLocation("Europe/Amsterdam")
+ x(err)
+
+ files, err := os.ReadDir(".") // txt2corpus.sh starts this program from <week>/out, where xml2txt has put the .txt files
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if strings.HasSuffix(filename, ".txt") {
+ doText("", filename)
+ } else if strings.HasSuffix(filename, ".xml") {
+ doXml("", filename)
+ }
+ }
+ files, err = os.ReadDir("..") // same scan for the parent directory
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if strings.HasSuffix(filename, ".txt") {
+ doText("../", filename)
+ } else if strings.HasSuffix(filename, ".xml") {
+ doXml("../", filename)
+ }
+ }
+
+ files, err = os.ReadDir("xml")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if !strings.HasSuffix(filename, ".xml") {
+ continue
+ }
+ aa := strings.Split(filename, ".")
+ base := strings.Join(aa[1:len(aa)-2], ".") // drop the first dot-separated part and the last two; txt2corpus.sh keys lines as litnl.<base>.<n>. NOTE(review): a name with a single dot makes this slice expression panic
+ b, err := os.ReadFile("xml/" + filename)
+ x(err)
+ s := string(b)
+ i := strings.Index(s, "\n \n")) // NOTE(review): this line is truncated in this copy — the end of the Index call and the statements that create fp (apparently xml/<name>.tmp, see the Rename below) are missing; restore from the repository
+ for _, m := range data[base] {
+ x(fp.WriteString(" " + m + "\n")) // one line per collected metadata string
+ }
+ x(fp.WriteString(" \n "))
+ x(fp.WriteString(stripMeta(s[i:]))) // rest of the original document, passed through stripMeta
+ x(fp.Close())
+ x(os.Rename("xml/"+filename+".tmp", "xml/"+filename)) // replace the original with the rewritten copy
+ }
+}
+
+func doText(dirname, filename string) { // doText scans the file dirname+filename for lines starting with "##META" and appends one formatted string per usable line to data, keyed by filename without its extension.
+ base := filename[:len(filename)-4] // cut the last four bytes; main only passes names ending in ".txt"
+ if _, ok := data[base]; !ok {
+ data[base] = make([]string, 0) // register the key even when no ##META line follows
+ }
+ fp, err := os.Open(dirname + filename) // dirname is "" or ends in "/", so plain concatenation gives the path
+ x(err)
+ defer func() { x(fp.Close()) }()
+ scanner := bufio.NewScanner(fp)
+ for scanner.Scan() {
+ line := scanner.Text()
+ if !strings.HasPrefix(line, "##META") {
+ continue
+ }
+ aa := strings.Fields(line)
+ if len(aa) > 4 { // needs "##META", two words, a separator and at least one value word; aa[3] is skipped (xml2txt writes "=" there)
+ data[base] = append(data[base],
+ fmt.Sprintf(``, // NOTE(review): the format string is empty although three arguments follow — it appears to have been lost from this copy; restore from the repository
+ aa[1],
+ escape(aa[2]),
+ escape(strings.Join(aa[4:], " ")))) // value words are rejoined with single spaces and HTML-escaped
+ }
+ }
+ x(scanner.Err())
+}
+
+func doXml(dirname, filename string) { // doXml decodes the <unixTime> of the item file dirname+filename and appends one string built from its year, month and day to data, keyed by filename without its extension.
+ base := filename[:len(filename)-4] // cut the last four bytes; main only passes names ending in ".xml"
+ if _, ok := data[base]; !ok {
+ data[base] = make([]string, 0)
+ }
+ b, err := os.ReadFile(dirname + filename)
+ x(err)
+ var item Item
+ x(xml.Unmarshal(b, &item))
+ t := time.Unix(item.UnixTime, 0).In(location) // the calendar date is taken in the zone main loaded (Europe/Amsterdam)
+ data[base] = append(data[base],
+ fmt.Sprintf(``, // NOTE(review): the format string is empty although three arguments follow — it appears to have been lost from this copy; restore from the repository
+ t.Year(),
+ int(t.Month()),
+ t.Day()))
+}
+
+func stripMeta(s string) string { // stripMeta returns s without the span that runs from the first start marker through the following end marker, and without the whitespace directly after that span.
+ i1 := strings.Index(s, "") // NOTE(review): the search string is empty in this copy; presumably an opening tag was lost — restore from the repository
+ if i1 < 0 {
+ return s // no start marker: s is returned unchanged
+ }
+ i2 := i1 + strings.Index(s[i1:], "") + 11 // 11 is presumably the byte length of the (lost) end marker; NOTE(review): a -1 result from Index is not checked here
+ return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n")
+}
diff --git a/LitNL/cmd/xml2txt/xml2txt.go b/LitNL/cmd/xml2txt/xml2txt.go
new file mode 100644
index 0000000..123c31c
--- /dev/null
+++ b/LitNL/cmd/xml2txt/xml2txt.go
@@ -0,0 +1,105 @@
+package main
+
+import (
+ e "codeberg.org/pebbe/errors"
+ "github.com/jbowtie/gokogiri"
+
+ "encoding/xml"
+ "fmt"
+ "os"
+ "regexp"
+ "strings"
+ "time"
+)
+
+type Item struct { // Item holds the fields of a stored feed item that end up in the text file.
+ Title string `xml:"title"` // written as the first text line
+ Text string `xml:"encoded"` // matched by local name only; presumably the feed's content:encoded element — confirm
+ Cats []string `xml:"category"` // one entry per <category> element
+}
+
+var (
+ w = e.WarnErr // helpers from codeberg.org/pebbe/errors; NOTE(review): presumably w reports an error and continues while x ends the program — confirm
+ x = e.ExitErr
+
+ reYearWeek = regexp.MustCompile(`^2[0-9][0-9][0-9]-[0-5][0-9]$`) // shape check for yyyy-ww only; week numbers such as 00 or 59 also match
+)
+
+func main() { // main converts the stored feed items of one ISO week (argument yyyy-ww; default: the week of the date seven days ago) into text files under out/.
+
+ var ds string
+ switch len(os.Args) {
+ case 1:
+ year, week := time.Now().AddDate(0, 0, -7).ISOWeek() // no argument: ISO week of the date seven days ago
+ ds = fmt.Sprintf("%d-%02d", year, week)
+ case 2:
+ if !reYearWeek.MatchString(os.Args[1]) {
+ x(fmt.Errorf("arg must be yyyy-ww"))
+ }
+ ds = os.Args[1]
+ default:
+ x(fmt.Errorf("too many arguments"))
+ }
+ dp := ds[:4] + "/" + ds[5:] // "yyyy-ww" -> "yyyy/ww"
+
+ x(os.Chdir("/net/corpora/nlnieuws/LitNL/" + dp)) // all paths below are relative to the week directory
+ x(os.MkdirAll("out", 0777))
+ files, err := os.ReadDir(".")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if !strings.HasSuffix(filename, ".xml") {
+ continue
+ }
+ b, err := os.ReadFile(filename)
+ x(err)
+ fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") // same base name, ".txt" instead of ".xml"
+ x(err)
+ var item Item
+ x(xml.Unmarshal(b, &item))
+ for _, cat := range item.Cats {
+ x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat))) // one metadata line per category
+ }
+ x(fp.WriteString(addEnd(fixSpace(item.Title)))) // the title becomes the first text line
+ doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) // NOTE(review): both raw strings are empty in this copy; presumably wrapper markup was lost — restore from the repository
+ x(err)
+ root := doc.Root() // NOTE(review): doc is never released; gokogiri documents normally need doc.Free() — confirm
+ pp, err := root.Search(`//body//p`) // one output line per <p> element ...
+ x(err)
+ if len(pp) == 0 {
+ pp, err = root.Search(`//body`) // ... or the body as a whole when there is no <p>
+ x(err)
+ }
+ if len(pp) == 0 {
+ _ = w(fmt.Errorf("empty: %s", filename)) // only reported; the file still gets its metadata and title lines
+ }
+ for _, p := range pp {
+ x(fp.WriteString(addEnd(fixSpace(p.Content()))))
+ }
+ x(fp.Close())
+ }
+}
+
+// addEnd trims s and returns it as a single newline-terminated line. A period
+// is appended unless the text already ends in '.', '!' or '?', optionally
+// followed by one straight quote (" or '). Text that is empty after trimming
+// yields "".
+func addEnd(s string) string {
+	line := strings.TrimSpace(s)
+	if line == "" {
+		return ""
+	}
+	size := len(line)
+	switch line[size-1] {
+	case '.', '!', '?':
+		return line + "\n"
+	case '"', '\'':
+		// A closing quote only counts when the byte before it ends a sentence.
+		if size > 1 {
+			switch line[size-2] {
+			case '.', '!', '?':
+				return line + "\n"
+			}
+		}
+	}
+	return line + ".\n"
+}
+
+func fixSpace(s string) string {
+ return strings.Join(strings.Fields(s), " ")
+}
diff --git a/LitNL/txt2corpus.sh b/LitNL/txt2corpus.sh
new file mode 100755
index 0000000..3df94b5
--- /dev/null
+++ b/LitNL/txt2corpus.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+set -e # stop at the first failing command
+
+unset CDPATH
+PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+export TZ=Europe/Amsterdam
+. /net/aps/etc/alpino-activate.sh > /dev/null
+
+if [ "$1" = "" ] # no argument: take the week reported by "ISOWeek -7"
+then
+ ds=`ISOWeek -7`
+else
+ case "$1" in
+ 2[0-9][0-9][0-9]-[0-5][0-9]) # the argument has to look like yyyy-ww
+ ds=$1
+ ;;
+ *)
+ echo INVALID
+ exit 1
+ ;;
+ esac
+fi
+
+dp=${ds//-//} # yyyy-ww -> yyyy/ww
+
+corpus=/net/corpora/nlnieuws/LitNL/corpus/$ds # common prefix of all output files
+
+cd /net/corpora/nlnieuws/LitNL/$dp
+
+ln -s lock.$$ lock # lock for this week directory: a symlink named after this shell's pid; ln fails when "lock" already exists
+if [ "`readlink lock`" != lock.$$ ]
+then
+ echo Getting lock failed
+ exit 1
+fi
+
+rm -fr out
+mkdir out
+
+../../xml2txt $ds # writes out/*.txt from the *.xml files of this week
+
+rm -f $corpus.lines
+for i in out/*.txt
+do
+ b=`basename $i .txt` # base name, used in the line keys below
+ perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \
+ | perl -e '$n = 0; while(<>) { $n++; print("litnl.'$b'.$n|$_"); }' \
+ >> $corpus.lines # every tokenized line is prefixed with the key litnl.<base>.<line number>
+done
+
+cd out
+mkdir xml
+Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log # NOTE(review): presumably writes one parse per input line into ./xml — confirm
+
+../../../metadata # started from out/: adds the collected metadata to the files in ./xml
+
+cd xml
+rm -f $corpus.data.dz $corpus.index
+alto -q -o $corpus.data.dz *.xml
+
+# counting per article, not per sentence
+/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+
+cd ../.. # back to the week directory
+rm -fr out
+
+rm -f lock # release the lock; NOTE(review): not reached when a command above fails under "set -e", so the lock then stays in place
diff --git a/Makefile b/Makefile
index f2159fa..d71031b 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@
all:
make -C AT5
make -C GG
+ make -C LitNL
make -C NieuwsNL
make -C NOS
make -C NU
diff --git a/collect.sh b/collect.sh
index 3f916b0..18d6f29 100755
--- a/collect.sh
+++ b/collect.sh
@@ -41,12 +41,16 @@ declare -A parts
parts[alles]='.'
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso'
parts[groningen]='GG|Oog|RTVNoord|Sikkom'
+parts[literatuur]='LitNL|Tzum'
parts[AT5]='AT5'
parts[GG]='GG'
+parts[LitNL]='LitNL'
parts[NOS]='NOS'
parts[NU]='NU'
parts[NieuwsNL]='NieuwsNL'
+parts[Oog]='Oog'
parts[RO]='RO'
+parts[RTVNoord]='RTVNoord'
parts[Sargasso]='Sargasso'
parts[Sikkom]='Sikkom'
parts[Tzum]='Tzum'