diff --git a/.gitignore b/.gitignore
index 9a7719b..2365536 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,8 @@ BuurtGrn/buurtgrn
BuurtGrn/metadata
GG/gg
GG/metadata
+HLN/metadata
+HLN/hln
LitNL/litnl
LitNL/metadata
LitNL/xml2txt
diff --git a/HLN/Makefile b/HLN/Makefile
new file mode 100644
index 0000000..911b43f
--- /dev/null
+++ b/HLN/Makefile
@@ -0,0 +1,9 @@
+# Default target: build both binaries of this directory.
+all: \
+ metadata \
+ hln
+
+# Each binary is built from every Go file in its cmd/ subdirectory;
+# $@ is the target name and $^ the list of prerequisite source files.
+metadata: cmd/metadata/*.go
+ go build -o $@ $^
+
+hln: cmd/hln/*.go
+ go build -o $@ $^
diff --git a/HLN/cmd/hln/hln.go b/HLN/cmd/hln/hln.go
new file mode 100644
index 0000000..b205c53
--- /dev/null
+++ b/HLN/cmd/hln/hln.go
@@ -0,0 +1,338 @@
+package main
+
+import (
+ e "codeberg.org/pebbe/errors"
+ "github.com/jbowtie/gokogiri"
+
+ "encoding/xml"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "os"
+ "path/filepath"
+ "strings"
+ "time"
+)
+
+// Types used to decode the RSS feed with encoding/xml.
+type (
+	// Rss is the root "rss" element; only the items of its channel are decoded.
+	Rss struct {
+		XMLName xml.Name `xml:"rss"`
+		Items   []ItemT  `xml:"channel>item"`
+	}
+
+	// ItemT is a single feed item. Data receives the raw inner XML of the
+	// item, in addition to the individually decoded child elements.
+	ItemT struct {
+		PubDate  string `xml:"pubDate"`
+		UnixTime int64  `xml:"unixTime"`
+		Guid     string `xml:"guid"`
+		Link     string `xml:"link"`
+		Data     []byte `xml:",innerxml"`
+	}
+)
+
+// p and w are short aliases for the error helpers of the pebbe/errors
+// package; p is used for fatal errors and w for warnings throughout this file.
+var p, w = e.PanicErr, e.WarnErr
+
+// agent is the value sent in the User-Agent header of every HTTP request.
+var agent = "AhrefsBot/7.0"
+
+// exists reports whether os.Stat succeeds for filename. Every Stat error,
+// not only "does not exist", is reported as false.
+func exists(filename string) bool {
+	if _, err := os.Stat(filename); err != nil {
+		return false
+	}
+	return true
+}
+
+func fileDate(filename string) string {
+ b, err := os.ReadFile(filename)
+ if err != nil {
+ return ""
+ }
+ s := string(b)
+ i1 := strings.Index(s, "") + 10
+ i2 := strings.Index(s, "")
+ if i2 < i1 {
+ return ""
+ }
+ return s[i1:i2]
+}
+
+// main downloads the hln.be RSS feed, stores every item as an .xml file in a
+// directory named after the ISO year and week of its publication date, and
+// calls doArticle to fetch the article text for that item.
+func main() {
+ // If an error was reported through p (e.Panicked is set), swallow the
+ // panic and exit with status 1.
+ defer func() {
+ if e.Panicked {
+ _ = recover()
+ os.Exit(1)
+ }
+ }()
+
+ // Take the global lock; it is removed again when main returns or unwinds.
+ myLock := "/net/corpora/nlnieuws/HLN/lock"
+ mkLock(myLock)
+ defer func() {
+ _ = os.Remove(myLock)
+ }()
+
+ req, err := http.NewRequest("GET", "https://www.hln.be/home/rss.xml", nil)
+ p(err)
+ req.Header.Set("User-Agent", agent)
+
+ client := &http.Client{}
+ resp, err := client.Do(req)
+ p(err)
+ body, err := io.ReadAll(resp.Body)
+ p(err)
+ p(resp.Body.Close())
+
+ var rss Rss
+ p(xml.Unmarshal(body, &rss))
+
+ // A feed without items is treated as an error.
+ if len(rss.Items) == 0 {
+ p(fmt.Errorf("len(rss.Items) == 0"))
+ }
+
+ for _, item := range rss.Items {
+ // Accept the publication date with a numeric zone first, then with a
+ // zone abbreviation.
+ t, err := time.Parse(time.RFC1123Z, item.PubDate)
+ if err != nil {
+ t, err = time.Parse(time.RFC1123, item.PubDate)
+ }
+ p(err)
+ year, week := t.ISOWeek()
+ dirname := fmt.Sprintf("/net/corpora/nlnieuws/HLN/%d/%02d", year, week)
+ // A week directory that contains a "lock" entry is left untouched.
+ if exists(dirname + "/lock") {
+ continue
+ }
+ // Derive the file name from the guid: drop the site prefix and a
+ // trailing slash, then keep only what follows the first "~" when that
+ // character is neither the first nor the last one.
+ basename := strings.TrimPrefix(item.Guid, "https://www.hln.be/")
+ basename = strings.TrimSuffix(basename, "/")
+ if n, i := len(basename), strings.Index(basename, "~"); i < n-1 && i > 0 {
+ basename = basename[i+1:]
+ }
+
+ filename := dirname + "/" + url.PathEscape(basename)
+
+ // The article must be refreshed when the timestamp stored in an existing
+ // .xml file differs from the item's publication time in Unix seconds.
+ ts := fmt.Sprintf("%d", t.Unix())
+ needUpdate := fileDate(filename+".xml") != ts
+
+ p(os.MkdirAll(dirname, 0777))
+ func() {
+ var ok bool
+ defer func() {
+ // When unwinding after an error, report which item was being handled.
+ if e.Panicked {
+ fmt.Fprintln(os.Stderr, "----", filename)
+ fmt.Fprintln(os.Stderr, "----", item.Link)
+ }
+ // Unless doArticle returned true, remove the .xml file again.
+ if !ok {
+ _ = os.Remove(filename + ".xml")
+ }
+ }()
+ fp, err := os.Create(filename + ".xml")
+ p(err)
+ // NOTE(review): the string literals in the next four statements look
+ // damaged (markup apparently stripped; one literal is split over two
+ // lines) — restore them from the upstream source before building.
+ p(fp.WriteString("\n- \n"))
+ p(fmt.Fprintf(fp, "%d", t.Unix()))
+ p(fp.Write(item.Data))
+ p(fp.WriteString("
\n"))
+ p(fp.Close())
+ // Give the file the publication time as access and modification time.
+ p(os.Chtimes(filename+".xml", t, t))
+ ok = doArticle(filename, item.Link, t, needUpdate)
+ }()
+ }
+}
+
+func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) {
+ if exists(filename + ".skip") {
+ return true
+ }
+ if needUpdate {
+ _ = os.Remove(filename + ".err")
+ _ = os.Remove(filename + ".html")
+ _ = os.Remove(filename + ".txt")
+ } else {
+ if exists(filename + ".txt") {
+ return true
+ }
+ }
+ time.Sleep(2 * time.Second)
+
+ req, err := http.NewRequest("GET", url, nil)
+ p(err)
+ req.Header.Set("User-Agent", agent)
+
+ client := &http.Client{}
+ resp, err := client.Do(req)
+ p(err)
+ body, err := io.ReadAll(resp.Body)
+ p(err)
+ p(resp.Body.Close())
+
+ doc, err := gokogiri.ParseHtml(body)
+ p(err)
+
+ root := doc.Root()
+
+ articles, err := root.Search(`//article[@id="article-content"]`)
+ p(err)
+ if len(articles) == 0 {
+ _ = w(fmt.Errorf("empty: %s", url))
+
+ fp, err := os.Create(filename + ".err")
+ p(err)
+ p(fmt.Fprintf(fp, "empty: %s\n", url))
+ p(fp.Close())
+ p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+ fp, err = os.Create(filename + ".html")
+ p(err)
+ p(fp.Write(body))
+ p(fp.Close())
+ p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+ return false
+ }
+ article := articles[0]
+
+ tags := make([]string, 0)
+ ell, err := article.Search(`//*[@data-content-type="LABEL"]`)
+ p(err)
+ if len(ell) == 0 {
+ _ = w(fmt.Errorf("no labels: %s", url))
+ }
+ for _, el := range ell {
+ s := strings.TrimSpace(el.Content())
+ if s != "" {
+ tags = append(tags, s)
+ }
+ }
+
+ pars := make([]string, 0)
+
+ ell, err = article.Search(`//*[@data-content-type="TITLE"]`)
+ p(err)
+ if len(ell) != 1 {
+ _ = w(fmt.Errorf("found %d titles: %s", len(ell), url))
+ }
+ for _, el := range ell {
+ s := strings.TrimSpace(el.Content())
+ if s != "" {
+ pars = append(pars, s)
+ }
+ }
+
+ found := false
+ ell, err = article.Search(`//*[@data-content-type="INTRO"]`)
+ p(err)
+ for _, el := range ell {
+ s := strings.TrimSpace(el.Content())
+ if s != "" {
+ pars = append(pars, s)
+ found = true
+ }
+ }
+ if !found {
+ _ = w(fmt.Errorf("no intro: %s", url))
+ }
+
+ specials, err := article.Search(`//*[@data-content-type="GROUP"]`)
+ p(err)
+ for _, special := range specials {
+ special.Remove()
+ }
+
+ ell, err = article.Search(`//*[@data-content-type="PARAGRAPH"]`)
+ p(err)
+ if len(ell) == 0 {
+ _ = w(fmt.Errorf("no paragraphs: %s", url))
+
+ fp, err := os.Create(filename + ".err")
+ p(err)
+ p(fmt.Fprintf(fp, "no paragraphs: %s\n", url))
+ p(fp.Close())
+ p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+ fp, err = os.Create(filename + ".html")
+ p(err)
+ p(fp.Write(body))
+ p(fp.Close())
+ p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+ return false
+ }
+
+ found = false
+ for _, el := range ell {
+ s := strings.TrimSpace(el.Content())
+ if s != "" {
+ pars = append(pars, s)
+ found = true
+ }
+ }
+ if !found {
+ _ = w(fmt.Errorf("no text, skipping: %s", url))
+ fp, err := os.Create(filename + ".skip")
+ p(fp.WriteString(url + "\n"))
+ p(err)
+ p(os.Chtimes(filename+".skip", timestamp, timestamp))
+
+ fp, err = os.Create(filename + ".html")
+ p(err)
+ p(fp.Write(body))
+ p(fp.Close())
+ p(os.Chtimes(filename+".html", timestamp, timestamp))
+
+ return true
+ }
+
+ fp, err := os.Create(filename + ".txt")
+ p(err)
+
+ if len(tags) == 0 {
+ p(fmt.Fprintln(fp, "##META text tag ="))
+ } else {
+ for _, tag := range tags {
+ p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
+ }
+ }
+
+ for _, par := range pars {
+ p(fp.WriteString(addEnd(fixSpace(par))))
+ }
+
+ p(fp.Close())
+
+ p(os.Chtimes(filename+".txt", timestamp, timestamp))
+
+ return true
+}
+
+// addEnd trims s and terminates it with a newline. When the trimmed text
+// does not already end in ".", "!" or "?", optionally followed by one of the
+// quote characters ", ' or ”, a full stop is appended first. An empty or
+// all-whitespace s yields "".
+func addEnd(s string) string {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return ""
+	}
+	for _, mark := range []string{".", "!", "?"} {
+		for _, quote := range []string{"", `"`, `'`, `”`} {
+			if strings.HasSuffix(s, mark+quote) {
+				return s + "\n"
+			}
+		}
+	}
+	return s + ".\n"
+}
+
+// fixSpace collapses every run of whitespace in s into a single space and
+// removes leading and trailing whitespace.
+func fixSpace(s string) string {
+	words := strings.Fields(s)
+	return strings.Join(words, " ")
+}
+
+// mkLock takes a lock by creating a symbolic link at filename whose target
+// is the base name of filename followed by "." and the current process ID.
+// The link is read back afterwards, and a target other than the expected one
+// is reported as an error. All errors are passed to p.
+func mkLock(filename string) {
+	target := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
+	p(os.Symlink(target, filename))
+
+	got, err := os.Readlink(filename)
+	p(err)
+	if got == target {
+		return
+	}
+	p(fmt.Errorf("wrong lock name %q, should be %q", got, target))
+}
diff --git a/HLN/cmd/metadata/metadata.go b/HLN/cmd/metadata/metadata.go
new file mode 100644
index 0000000..18ff292
--- /dev/null
+++ b/HLN/cmd/metadata/metadata.go
@@ -0,0 +1,131 @@
+package main
+
+import (
+ e "codeberg.org/pebbe/errors"
+
+ "bufio"
+ "encoding/xml"
+ "fmt"
+ "html"
+ "os"
+ "strings"
+ "time"
+)
+
+// Item is the decoding target for an "item" XML document; only the unixTime
+// child element is read.
+type Item struct {
+ XMLName xml.Name `xml:"item"`
+ UnixTime int64 `xml:"unixTime"`
+}
+
+// x is a short alias for the error helper of the pebbe/errors package that
+// this program uses for every error it checks.
+var x = e.ExitErr
+
+// escape is a short alias for html.EscapeString.
+var escape = html.EscapeString
+
+// data maps a file name without its four-byte extension to the metadata
+// lines collected for it by doText and doXml.
+var data = make(map[string][]string)
+
+// location is the time zone used to turn Unix timestamps into calendar
+// dates; main sets it before any file is processed.
+var location *time.Location
+
+// main collects metadata from the .txt and .xml files in the current and the
+// parent directory, then rewrites every .xml file in the "xml" subdirectory
+// so that it carries the metadata collected for its base name.
+func main() {
+ var err error
+ location, err = time.LoadLocation("Europe/Amsterdam")
+ x(err)
+
+ // Pass 1: files in the current directory.
+ files, err := os.ReadDir(".")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if strings.HasSuffix(filename, ".txt") {
+ doText("", filename)
+ } else if strings.HasSuffix(filename, ".xml") {
+ doXml("", filename)
+ }
+ }
+ // Pass 2: the same for the parent directory.
+ files, err = os.ReadDir("..")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if strings.HasSuffix(filename, ".txt") {
+ doText("../", filename)
+ } else if strings.HasSuffix(filename, ".xml") {
+ doXml("../", filename)
+ }
+ }
+
+ // Pass 3: rewrite the files in xml/.
+ files, err = os.ReadDir("xml")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if !strings.HasSuffix(filename, ".xml") {
+ continue
+ }
+ // The lookup key is the file name without its first and its last two
+ // dot-separated parts, e.g. "hln.<base>.<n>.xml" gives "<base>".
+ // A name with fewer than three parts makes the slice expression panic.
+ aa := strings.Split(filename, ".")
+ base := strings.Join(aa[1:len(aa)-2], ".")
+ b, err := os.ReadFile("xml/" + filename)
+ x(err)
+ s := string(b)
+ // NOTE(review): the next line looks truncated (unbalanced parentheses,
+ // and fp is used below without a visible declaration); the string
+ // literals around it appear to have lost their markup. Restore this
+ // passage from the upstream source before building.
+ i := strings.Index(s, "\n \n"))
+ for _, m := range data[base] {
+ x(fp.WriteString(" " + m + "\n"))
+ }
+ x(fp.WriteString(" \n "))
+ // Copy the rest of the document, minus any metadata block it already has.
+ x(fp.WriteString(stripMeta(s[i:])))
+ x(fp.Close())
+ // Replace the original file with the rewritten temporary file.
+ x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
+ }
+}
+
+// doText reads dirname+filename line by line and records one metadata entry
+// in data for every "##META" line that has more than four fields. The key is
+// filename without its last four bytes (the ".txt" extension checked by the
+// caller).
+func doText(dirname, filename string) {
+ base := filename[:len(filename)-4]
+ // Make sure the key exists even when the file has no usable ##META line.
+ if _, ok := data[base]; !ok {
+ data[base] = make([]string, 0)
+ }
+ fp, err := os.Open(dirname + filename)
+ x(err)
+ defer func() { x(fp.Close()) }()
+ scanner := bufio.NewScanner(fp)
+ for scanner.Scan() {
+ line := scanner.Text()
+ if !strings.HasPrefix(line, "##META") {
+ continue
+ }
+ // Fields 1 and 2 and everything from field 4 onward are used; field 3
+ // is skipped (presumably the "=" separator — verify against the writer).
+ // Lines without a value (four fields or fewer) are ignored.
+ aa := strings.Fields(line)
+ if len(aa) > 4 {
+ // NOTE(review): the format string below is empty although three
+ // arguments are passed; its markup appears to have been lost.
+ data[base] = append(data[base],
+ fmt.Sprintf(``,
+ aa[1],
+ escape(aa[2]),
+ escape(strings.Join(aa[4:], " "))))
+ }
+ }
+ x(scanner.Err())
+}
+
+// doXml decodes dirname+filename as an Item and records a metadata entry in
+// data built from the year, month and day of its UnixTime in the configured
+// location. The key is filename without its last four bytes (the ".xml"
+// extension checked by the caller).
+func doXml(dirname, filename string) {
+ base := filename[:len(filename)-4]
+ // Make sure the key exists before appending.
+ if _, ok := data[base]; !ok {
+ data[base] = make([]string, 0)
+ }
+ b, err := os.ReadFile(dirname + filename)
+ x(err)
+ var item Item
+ x(xml.Unmarshal(b, &item))
+ t := time.Unix(item.UnixTime, 0).In(location)
+ // NOTE(review): the format string below is empty although three arguments
+ // are passed; its markup appears to have been lost.
+ data[base] = append(data[base],
+ fmt.Sprintf(``,
+ t.Year(),
+ int(t.Month()),
+ t.Day()))
+}
+
+func stripMeta(s string) string {
+ i1 := strings.Index(s, "")
+ if i1 < 0 {
+ return s
+ }
+ i2 := i1 + strings.Index(s[i1:], "") + 11
+ return s[:i1] + strings.TrimLeft(s[i2:], " \t\r\n")
+}
diff --git a/HLN/txt2corpus.sh b/HLN/txt2corpus.sh
new file mode 100755
index 0000000..f05ce90
--- /dev/null
+++ b/HLN/txt2corpus.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Build the parsed corpus for one ISO week of HLN articles.
+# Usage: txt2corpus.sh [YYYY-WW]
+# Without an argument the ISO year and week of seven days ago are used.
+
+# Stop at the first failing command.
+set -e
+
+unset CDPATH
+PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+export TZ=Europe/Amsterdam
+. /net/aps/etc/alpino-activate.sh > /dev/null
+
+# Determine the week to process, validating the shape of an explicit argument.
+if [ "$1" = "" ]
+then
+ ds=`date -d -7days +%G-%V`
+else
+ case "$1" in
+ 2[0-9][0-9][0-9]-[0-5][0-9])
+ ds=$1
+ ;;
+ *)
+ echo INVALID
+ exit 1
+ ;;
+ esac
+fi
+
+# Same value with "-" replaced by "/": the YEAR/WEEK directory of the articles.
+dp=${ds//-//}
+
+# Common prefix of all output files for this week.
+corpus=/net/corpora/nlnieuws/HLN/corpus/$ds
+
+cd /net/corpora/nlnieuws/HLN/$dp
+
+# Lock the week directory with a symlink named after this process.
+# Because of "set -e", a failing ln ends the script before the check below.
+ln -s lock.$$ lock
+if [ "`readlink lock`" != lock.$$ ]
+then
+ echo Getting lock failed
+ exit 1
+fi
+
+# Start from an empty scratch directory.
+rm -fr out
+mkdir out
+
+# Collect the tokenized text of all articles in one file. Leading whitespace
+# and ##META lines are removed first; every resulting line is prefixed with
+# the key "hln.<article>.<line number>|".
+rm -f $corpus.lines
+for i in *.txt
+do
+ b=`basename $i .txt`
+ perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \
+ | perl -e '$n = 0; while(<>) { $n++; print("hln.'$b'.$n|$_"); }' \
+ >> $corpus.lines
+done
+
+# Parse all lines with Alpino, using out/xml as the treebank directory.
+cd out
+mkdir xml
+Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
+
+# Run the metadata binary located three directory levels up.
+../../../metadata
+
+# Pack the XML files into the corpus data file.
+cd xml
+rm -f $corpus.data.dz $corpus.index
+alto -q -o $corpus.data.dz *.xml
+
+# count per article, not per sentence
+/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+
+# Remove the scratch directory and release the lock.
+cd ../..
+rm -fr out
+
+rm -f lock
diff --git a/Makefile b/Makefile
index fbe6c0b..826a90b 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@ all:
make -C BuurtAdam
make -C BuurtGrn
make -C GG
+ make -C HLN
make -C LitNL
make -C NieuwsNL
make -C NOS
diff --git a/collect.sh b/collect.sh
index fffb9f7..c67ea4e 100755
--- a/collect.sh
+++ b/collect.sh
@@ -43,11 +43,12 @@ parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso'
parts[amsterdam]='AT5|BuurtAdam|Parool'
parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
parts[literatuur]='LitNL|Tzum'
-parts[vlaanderen]='VRT'
+parts[vlaanderen]='HLN|VRT'
#parts[AT5]='AT5'
#parts[BuurtAdam]='BuurtAdam'
#parts[BuurtGrn]='BuurtGrn'
#parts[GG]='GG'
+#parts[HLN]='HLN'
#parts[LitNL]='LitNL'
#parts[NOS]='NOS'
#parts[NU]='NU'