diff --git a/.gitignore b/.gitignore
index 77d784e..9a7719b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,8 @@ NU/nu
Oog/metadata
Oog/oog
Oog/xml2txt
+Parool/metadata
+Parool/parool
RO/metadata
RO/ro
RO/xml2txt
diff --git a/Amsterdam/Makefile b/Amsterdam/Makefile
deleted file mode 100644
index ac0dfb6..0000000
--- a/Amsterdam/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-all: amsterdam
-
-% : %.go
- go build $<
diff --git a/Amsterdam/amsterdam.go b/Amsterdam/amsterdam.go
deleted file mode 100644
index 063d8ea..0000000
--- a/Amsterdam/amsterdam.go
+++ /dev/null
@@ -1,200 +0,0 @@
-package main
-
-import (
- e "codeberg.org/pebbe/errors"
- "github.com/jbowtie/gokogiri"
-
- "encoding/xml"
- "fmt"
- "io"
- "net/http"
- "net/url"
- "os"
- "path/filepath"
- "strings"
- "time"
-)
-
-type Rss struct {
- XMLName xml.Name `xml:"rss"`
- Items []ItemT `xml:"channel>item"`
-}
-
-type ItemT struct {
- Title string `xml:"title"`
- PubDate string `xml:"pubDate"`
- UnixTime int64 `xml:"unixTime"`
- Guid string `xml:"guid"`
- Link string `xml:"link"`
- Data []byte `xml:",innerxml"`
-}
-
-var (
- p = e.PanicErr
- agent = "AhrefsBot/7.0"
- // agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36"
-)
-
-func exists(filename string) bool {
- _, err := os.Stat(filename)
- return err == nil
-}
-
-func main() {
- defer func() {
- if e.Panicked {
- _ = recover()
- os.Exit(1)
- }
- }()
-
- myLock := "/net/corpora/nlnieuws/Amsterdam/lock"
- mkLock(myLock)
- defer func() {
- _ = os.Remove(myLock)
- }()
-
- req, err := http.NewRequest("GET", "https://www.amsterdam.nl/nieuws/nieuwsoverzicht/?rss=true", nil)
- p(err)
- req.Header.Set("User-Agent", agent)
-
- client := &http.Client{}
- resp, err := client.Do(req)
- p(err)
- body, err := io.ReadAll(resp.Body)
- p(err)
- p(resp.Body.Close())
-
- var rss Rss
- p(xml.Unmarshal(body, &rss))
-
- if len(rss.Items) == 0 {
- p(fmt.Errorf("len(rss.Items) == 0"))
- }
-
- for _, item := range rss.Items {
- t, err := time.Parse(time.RFC1123Z, item.PubDate)
- if err != nil {
- t, err = time.Parse(time.RFC1123, item.PubDate)
- }
- p(err)
- dirname := fmt.Sprintf("/net/corpora/nlnieuws/Amsterdam/%d/%02d", t.Year(), int(t.Month()))
- if exists(dirname + "/lock") {
- continue
- }
- filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://www.amsterdam.nl/nieuws/"))
- p(os.MkdirAll(dirname, 0777))
- func() {
- var ok bool
- defer func() {
- if e.Panicked {
- fmt.Fprintln(os.Stderr, "----", filename, "----")
- }
- if !ok {
- _ = os.Remove(filename + ".xml")
- }
- }()
- fp, err := os.Create(filename + ".xml")
- p(err)
- p(fp.WriteString("\n- \n"))
- p(fmt.Fprintf(fp, "%d", t.Unix()))
- p(fp.Write(item.Data))
- p(fp.WriteString("
\n"))
- p(fp.Close())
- p(os.Chtimes(filename+".xml", t, t))
- doArticle(filename, item.Link, item.Title, t)
- ok = true
- }()
- }
-}
-
-func doArticle(filename string, url string, title string, timestamp time.Time) {
- if exists(filename + ".txt") {
- return
- }
- time.Sleep(2 * time.Second)
-
- req, err := http.NewRequest("GET", url, nil)
- p(err)
- req.Header.Set("User-Agent", agent)
-
- client := &http.Client{}
- resp, err := client.Do(req)
- p(err)
- body, err := io.ReadAll(resp.Body)
- p(err)
- p(resp.Body.Close())
-
- doc, err := gokogiri.ParseHtml(body)
- p(err)
-
- root := doc.Root()
-
- fp, err := os.Create(filename + ".txt")
- p(err)
-
- p(fp.WriteString(addEnd(title)))
-
- count := 0
-
- pp, err := root.Search(`//div[@id="zone_intro"]//div[contains(@class, "inleiding")]/p`)
- p(err)
- for _, p1 := range pp {
- p(fp.WriteString(addEnd(p1.Content())))
- count++
- }
-
- ell, err := root.Search(`//div[@id="zone_content"]//div[contains(@class, "tekst")]/child::*`)
- p(err)
- for _, el := range ell {
- if n := el.Name(); n == "p" || n == "h3" {
- p(fp.WriteString(addEnd(el.Content())))
- count++
- }
- }
-
- p(fp.Close())
-
- p(os.Chtimes(filename+".txt", timestamp, timestamp))
-
- if count == 0 {
- fp, err := os.Create(filename + ".debug.html")
- p(err)
- p(fp.Write(body))
- p(fp.Close())
- p(os.Chtimes(filename+".debug.html", timestamp, timestamp))
- }
-}
-
-func addEnd(s string) string {
- s = strings.TrimSpace(s)
- n := len(s)
- if n == 0 {
- return ""
- }
- if n > 0 {
- if strings.ContainsAny(s[n-1:], ".!?") {
- return s + "\n"
- }
- }
- if n > 1 {
- s2 := s[n-2:]
- if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` {
- return s + "\n"
- }
- }
- return s + ".\n"
-}
-
-func mkLock(filename string) {
- pid := os.Getpid()
- link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid)
- p(os.Symlink(link, filename))
-
- name, err := os.Readlink(filename)
- p(err)
-
- if name != link {
- p(fmt.Errorf("wrong lock name %q, should be %q", name, link))
- }
-}
diff --git a/Makefile b/Makefile
index 23b05ee..fbe6c0b 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ all:
make -C NOS
make -C NU
make -C Oog
+ make -C Parool
make -C RO
make -C RTVNoord
make -C Sargasso
diff --git a/Parool/Makefile b/Parool/Makefile
new file mode 100644
index 0000000..3575e76
--- /dev/null
+++ b/Parool/Makefile
@@ -0,0 +1,9 @@
+# Build both Parool tools. "all" names no file, so it is declared phony:
+# a stray file called "all" must not stop make from checking the binaries.
+.PHONY: all
+all: \
+	metadata \
+	parool
+
+# Each binary is rebuilt when any Go source file of its command changes.
+metadata: cmd/metadata/*.go
+	go build -o $@ $^
+
+parool: cmd/parool/*.go
+	go build -o $@ $^
diff --git a/Parool/cmd/metadata/metadata.go b/Parool/cmd/metadata/metadata.go
new file mode 100644
index 0000000..6427f23
--- /dev/null
+++ b/Parool/cmd/metadata/metadata.go
@@ -0,0 +1,131 @@
+package main
+
+import (
+ e "codeberg.org/pebbe/errors"
+
+ "bufio"
+ "encoding/xml"
+ "fmt"
+ "html"
+ "os"
+ "strings"
+ "time"
+)
+
+// Item is the decoding target used by doXml for an article's .xml file.
+// The root element must be named "item"; only its "unixTime" child is read.
+type Item struct {
+ XMLName xml.Name `xml:"item"`
+ UnixTime int64 `xml:"unixTime"`
+}
+
+var (
+ // x wraps every fallible call in this file.
+ // NOTE(review): presumably reports a non-nil error and exits; confirm in codeberg.org/pebbe/errors.
+ x = e.ExitErr
+ // escape is applied to the name and value fields taken from ##META lines.
+ escape = html.EscapeString
+ // data maps a file name without its 4-character extension to the metadata
+ // lines collected for it by doText and doXml.
+ data = make(map[string][]string)
+ // location is set in main to Europe/Amsterdam; doXml uses it to turn the
+ // stored Unix time into a calendar date.
+ location *time.Location
+)
+
+// main collects metadata from the .txt and .xml files in the current and the
+// parent directory, then rewrites every .xml file in the "xml" subdirectory
+// so that it carries the metadata collected for its article.
+func main() {
+ var err error
+ location, err = time.LoadLocation("Europe/Amsterdam")
+ x(err)
+
+ // Pass 1: gather metadata from the current directory.
+ files, err := os.ReadDir(".")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if strings.HasSuffix(filename, ".txt") {
+ doText("", filename)
+ } else if strings.HasSuffix(filename, ".xml") {
+ doXml("", filename)
+ }
+ }
+ // Pass 2: the same for the parent directory.
+ files, err = os.ReadDir("..")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if strings.HasSuffix(filename, ".txt") {
+ doText("../", filename)
+ } else if strings.HasSuffix(filename, ".xml") {
+ doXml("../", filename)
+ }
+ }
+
+ // Pass 3: rewrite each parse result in xml/ via a .tmp file and a rename.
+ files, err = os.ReadDir("xml")
+ x(err)
+ for _, file := range files {
+ filename := file.Name()
+ if !strings.HasSuffix(filename, ".xml") {
+ continue
+ }
+ // The key into data is the file name minus its first dot-separated field
+ // and its last two fields.
+ // NOTE(review): a name with fewer than three dot-separated fields makes
+ // this slice expression panic; confirm such names cannot occur in xml/.
+ aa := strings.Split(filename, ".")
+ base := strings.Join(aa[1:len(aa)-2], ".")
+ b, err := os.ReadFile("xml/" + filename)
+ x(err)
+ s := string(b)
+ // NOTE(review): the next line has an unbalanced ')' and fp is never
+ // declared in the visible code, although the rename below implies that
+ // "xml/"+filename+".tmp" is created here. Text appears to be missing;
+ // restore it from the original file.
+ i := strings.Index(s, "\n \n"))
+ for _, m := range data[base] {
+ x(fp.WriteString(" " + m + "\n"))
+ }
+ x(fp.WriteString(" \n "))
+ // Any metadata already present in the remainder is dropped by stripMeta.
+ x(fp.WriteString(stripMeta(s[i:])))
+ x(fp.Close())
+ x(os.Rename("xml/"+filename+".tmp", "xml/"+filename))
+ }
+}
+
+// doText scans dirname+filename for lines starting with "##META" and appends
+// one formatted entry per such line to data, keyed by filename without its
+// last four characters (the ".txt" suffix checked by the caller).
+func doText(dirname, filename string) {
+ base := filename[:len(filename)-4]
+ // Register the key even when the file turns out to hold no ##META lines.
+ if _, ok := data[base]; !ok {
+ data[base] = make([]string, 0)
+ }
+ fp, err := os.Open(dirname + filename)
+ x(err)
+ defer func() { x(fp.Close()) }()
+ scanner := bufio.NewScanner(fp)
+ for scanner.Scan() {
+ line := scanner.Text()
+ if !strings.HasPrefix(line, "##META") {
+ continue
+ }
+ // Fields: aa[0] is the ##META marker, aa[1] and aa[2] are passed on
+ // (aa[2] escaped), aa[3] is skipped, and aa[4:] is joined with single
+ // spaces and escaped. Lines with four fields or fewer add nothing.
+ aa := strings.Fields(line)
+ if len(aa) > 4 {
+ data[base] = append(data[base],
+ // NOTE(review): the format string is empty although three arguments
+ // are passed; the literal appears to have lost its content. Restore
+ // it from the original file.
+ fmt.Sprintf(``,
+ aa[1],
+ escape(aa[2]),
+ escape(strings.Join(aa[4:], " "))))
+ }
+ }
+ x(scanner.Err())
+}
+
+// doXml decodes dirname+filename as an Item and appends one entry built from
+// the year, month and day of its UnixTime, taken in the package-level
+// location, to data. The key is filename without its last four characters
+// (the ".xml" suffix checked by the caller).
+func doXml(dirname, filename string) {
+ base := filename[:len(filename)-4]
+ if _, ok := data[base]; !ok {
+ data[base] = make([]string, 0)
+ }
+ b, err := os.ReadFile(dirname + filename)
+ x(err)
+ var item Item
+ x(xml.Unmarshal(b, &item))
+ t := time.Unix(item.UnixTime, 0).In(location)
+ data[base] = append(data[base],
+ // NOTE(review): the format string is empty although three arguments are
+ // passed; the literal appears to have lost its content. Restore it from
+ // the original file.
+ fmt.Sprintf(``,
+ t.Year(),
+ int(t.Month()),
+ t.Day()))
+}
+
+// stripMeta removes the first <metadata>...</metadata> element from s,
+// together with any spaces, tabs and line breaks that follow its closing tag.
+// Text before the opening tag is kept as it is.
+//
+// s is returned unchanged when it contains no opening tag, or when the
+// opening tag is never closed. The previous code handled the second case by
+// adding a fixed offset to a failed search and cut the string at an arbitrary
+// position.
+//
+// The two tag literals were empty in the reviewed text; they are restored
+// from the original "+ 11" offset, which is the length of "</metadata>".
+func stripMeta(s string) string {
+	const (
+		openTag  = "<metadata>"
+		closeTag = "</metadata>"
+	)
+	start := strings.Index(s, openTag)
+	if start < 0 {
+		return s
+	}
+	// Search for the closing tag only after the opening tag.
+	rel := strings.Index(s[start:], closeTag)
+	if rel < 0 {
+		return s
+	}
+	rest := s[start+rel+len(closeTag):]
+	return s[:start] + strings.TrimLeft(rest, " \t\r\n")
+}
diff --git a/Parool/cmd/parool/parool.go b/Parool/cmd/parool/parool.go
new file mode 100644
index 0000000..1ab5502
--- /dev/null
+++ b/Parool/cmd/parool/parool.go
@@ -0,0 +1,318 @@
+package main
+
+import (
+ e "codeberg.org/pebbe/errors"
+ "github.com/jbowtie/gokogiri"
+
+ "encoding/xml"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "os"
+ "path/filepath"
+ "strings"
+ "time"
+)
+
+// Rss is the decoding target for the downloaded feed: the root element must
+// be named "rss", and Items receives every "item" inside "channel".
+type Rss struct {
+ XMLName xml.Name `xml:"rss"`
+ Items []ItemT `xml:"channel>item"`
+}
+
+// ItemT holds one feed item.
+type ItemT struct {
+ // PubDate is parsed in main as RFC 1123 time, with or without numeric zone.
+ PubDate string `xml:"pubDate"`
+ // UnixTime is decoded when the item has a unixTime child; main does not
+ // read it and uses the parsed PubDate instead.
+ UnixTime int64 `xml:"unixTime"`
+ // Guid, path-escaped, becomes the base name of the files for this item.
+ Guid string `xml:"guid"`
+ // Link is the URL that doArticle downloads.
+ Link string `xml:"link"`
+ // Data is the raw inner XML of the item; main copies it into the .xml file.
+ Data []byte `xml:",innerxml"`
+}
+
+var (
+ // p wraps calls whose failure must abort the run; main checks e.Panicked
+ // and exits with status 1.
+ // NOTE(review): presumably panics on a non-nil error; confirm in codeberg.org/pebbe/errors.
+ p = e.PanicErr
+ // w is used for problems that are reported without stopping the run.
+ // NOTE(review): presumably prints a warning; confirm in codeberg.org/pebbe/errors.
+ w = e.WarnErr
+ // agent is sent as the User-Agent header of every HTTP request.
+ agent = "AhrefsBot/7.0"
+)
+
+// exists reports whether os.Stat succeeds for filename. Any Stat error,
+// not only "file does not exist", counts as absent.
+func exists(filename string) bool {
+	if _, statErr := os.Stat(filename); statErr != nil {
+		return false
+	}
+	return true
+}
+
+// fileDate returns the text between the first <unixTime> tag in filename and
+// the first </unixTime> tag that follows it. It returns "" when the file
+// cannot be read or when either tag is missing.
+//
+// main compares the result with the feed's timestamp to decide whether a
+// stored article is out of date.
+//
+// The two tag literals were empty in the reviewed text; they are restored
+// from the original "+ 10" offset, which is the length of "<unixTime>".
+// The previous code added that offset without checking that the opening tag
+// was found, and searched for the closing tag from the start of the file.
+func fileDate(filename string) string {
+	const (
+		openTag  = "<unixTime>"
+		closeTag = "</unixTime>"
+	)
+	b, err := os.ReadFile(filename)
+	if err != nil {
+		// A file that does not exist yet simply has no date.
+		return ""
+	}
+	s := string(b)
+	start := strings.Index(s, openTag)
+	if start < 0 {
+		return ""
+	}
+	s = s[start+len(openTag):]
+	end := strings.Index(s, closeTag)
+	if end < 0 {
+		return ""
+	}
+	return s[:end]
+}
+
+// main downloads the Parool Amsterdam feed and stores every item below
+// /net/corpora/nlnieuws/Parool/<ISO year>/<ISO week>/ as an .xml file,
+// then calls doArticle to fetch the article text.
+func main() {
+ // Registered first, so it runs last: when e.Panicked is set, the panic is
+ // swallowed and the process exits with status 1.
+ defer func() {
+ if e.Panicked {
+ _ = recover()
+ os.Exit(1)
+ }
+ }()
+
+ // The lock is taken before its removal is deferred, so a failed mkLock
+ // does not remove a lock held by another process.
+ myLock := "/net/corpora/nlnieuws/Parool/lock"
+ mkLock(myLock)
+ defer func() {
+ _ = os.Remove(myLock)
+ }()
+
+ req, err := http.NewRequest("GET", "https://www.parool.nl/amsterdam/rss.xml", nil)
+ p(err)
+ req.Header.Set("User-Agent", agent)
+
+ // NOTE(review): the client has no timeout and the response status code is
+ // not checked before the body is decoded.
+ client := &http.Client{}
+ resp, err := client.Do(req)
+ p(err)
+ body, err := io.ReadAll(resp.Body)
+ p(err)
+ p(resp.Body.Close())
+
+ var rss Rss
+ p(xml.Unmarshal(body, &rss))
+
+ // A feed without items is treated as an error.
+ if len(rss.Items) == 0 {
+ p(fmt.Errorf("len(rss.Items) == 0"))
+ }
+
+ for _, item := range rss.Items {
+ // Try the numeric-zone form first, then the zone-abbreviation form.
+ t, err := time.Parse(time.RFC1123Z, item.PubDate)
+ if err != nil {
+ t, err = time.Parse(time.RFC1123, item.PubDate)
+ }
+ p(err)
+ year, week := t.ISOWeek()
+ dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/%02d", year, week)
+ // Skip week directories that hold a lock; Parool/txt2corpus.sh creates
+ // one while it processes a week.
+ if exists(dirname + "/lock") {
+ continue
+ }
+ basename := item.Guid
+ filename := dirname + "/" + url.PathEscape(basename)
+
+ // needUpdate is true when the stored .xml is missing or carries a
+ // different timestamp than the feed. It is computed before the .xml
+ // file is overwritten below.
+ ts := fmt.Sprintf("%d", t.Unix())
+ needUpdate := fileDate(filename+".xml") != ts
+
+ p(os.MkdirAll(dirname, 0777))
+ // The closure scopes the deferred cleanup to a single item.
+ func() {
+ var ok bool
+ defer func() {
+ if e.Panicked {
+ fmt.Fprintln(os.Stderr, "----", filename)
+ fmt.Fprintln(os.Stderr, "----", item.Link)
+ }
+ // Without a successful doArticle the .xml file is removed again.
+ if !ok {
+ _ = os.Remove(filename + ".xml")
+ }
+ }()
+ fp, err := os.Create(filename + ".xml")
+ p(err)
+ // NOTE(review): cmd/metadata decodes this file as an "item" element with
+ // a "unixTime" child, but the literals below contain no such markup and
+ // the last WriteString is split across two lines. The text looks
+ // damaged; restore it from the original file.
+ p(fp.WriteString("\n- \n"))
+ p(fmt.Fprintf(fp, "%d", t.Unix()))
+ p(fp.Write(item.Data))
+ p(fp.WriteString("
\n"))
+ p(fp.Close())
+ // File times are set to the publication time.
+ p(os.Chtimes(filename+".xml", t, t))
+ ok = doArticle(filename, item.Link, t, needUpdate)
+ }()
+ }
+}
+
+// doArticle downloads the article at url and stores its text in
+// filename+".txt": one "##META text tag = ..." line per label (or a single
+// empty one), followed by title, intro and body paragraphs, one per line.
+//
+// Parameters:
+//   - filename: path of the output files without extension.
+//   - url: address of the article page.
+//   - timestamp: applied as access and modification time to every file written.
+//   - needUpdate: when true, existing .err, .html and .txt files are removed
+//     and the article is fetched again.
+//
+// It returns true when the text was written, or when nothing had to be done
+// because a .skip file exists or, without needUpdate, a .txt file exists.
+// It returns false when no article element, no intro or no body text was
+// found; the reasons are then written to filename+".err" and the raw page
+// to filename+".html". I/O, HTTP and parse errors go through p.
+func doArticle(filename string, url string, timestamp time.Time, needUpdate bool) (ok bool) {
+	if exists(filename + ".skip") {
+		return true
+	}
+	if needUpdate {
+		// Best effort: files that are already absent are not an error.
+		_ = os.Remove(filename + ".err")
+		_ = os.Remove(filename + ".html")
+		_ = os.Remove(filename + ".txt")
+	} else if exists(filename + ".txt") {
+		return true
+	}
+	// Pause before every download to keep the request rate low.
+	time.Sleep(2 * time.Second)
+
+	req, err := http.NewRequest("GET", url, nil)
+	p(err)
+	req.Header.Set("User-Agent", agent)
+
+	client := &http.Client{}
+	resp, err := client.Do(req)
+	p(err)
+	body, err := io.ReadAll(resp.Body)
+	p(err)
+	p(resp.Body.Close())
+
+	doc, err := gokogiri.ParseHtml(body)
+	p(err)
+	// The document lives in C memory and is not garbage collected. All text
+	// is copied into Go strings before this function returns.
+	defer doc.Free()
+
+	articles, err := doc.Root().Search(`//article[@id="article-content"]`)
+	p(err)
+	if len(articles) == 0 {
+		_ = w(fmt.Errorf("empty: %s", url))
+		writeFailure(filename, []string{fmt.Sprintf("empty: %s\n", url)}, body, timestamp)
+		return false
+	}
+	article := articles[0]
+
+	// texts returns the trimmed, non-empty text content of every node that
+	// matches expr, in document order.
+	//
+	// NOTE(review): every expression below starts with "//", which in XPath 1.0
+	// selects from the document root rather than from the article element.
+	// If matching is meant to stay inside the article, ".//" is needed.
+	// Left unchanged because it can alter what is extracted.
+	texts := func(expr string) []string {
+		nodes, err := article.Search(expr)
+		p(err)
+		var found []string
+		for _, node := range nodes {
+			if s := strings.TrimSpace(node.Content()); s != "" {
+				found = append(found, s)
+			}
+		}
+		return found
+	}
+
+	tags := texts(`//header//*[@data-test-id="article-label"]`)
+	pars := texts(`//header//*[@data-test-id="article-title"]`)
+
+	// fouten collects the reasons why this page cannot be used.
+	var fouten []string
+
+	intro := texts(`//header//*[@data-test-id="header-intro"]`)
+	pars = append(pars, intro...)
+	if len(intro) == 0 {
+		fouten = append(fouten, fmt.Sprintf("no heading: %s\n", url))
+		_ = w(fmt.Errorf("no heading: %s", url))
+	}
+
+	// Drop asides, figures and bold runs before the body text is collected,
+	// so that their content does not end up in the paragraphs.
+	specials, err := article.Search(`//section//aside | //section//figure | //section//b`)
+	p(err)
+	for _, special := range specials {
+		special.Remove()
+	}
+
+	text := texts(`//section//*[@data-article-element-index]`)
+	pars = append(pars, text...)
+	if len(text) == 0 {
+		fouten = append(fouten, fmt.Sprintf("no text: %s\n", url))
+		_ = w(fmt.Errorf("no text: %s", url))
+	}
+
+	if len(fouten) > 0 {
+		writeFailure(filename, fouten, body, timestamp)
+		return false
+	}
+
+	fp, err := os.Create(filename + ".txt")
+	p(err)
+
+	// There is always at least one tag line, even without labels.
+	if len(tags) == 0 {
+		p(fmt.Fprintln(fp, "##META text tag ="))
+	}
+	for _, tag := range tags {
+		p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
+	}
+
+	for _, par := range pars {
+		p(fp.WriteString(addEnd(fixSpace(par))))
+	}
+
+	p(fp.Close())
+
+	p(os.Chtimes(filename+".txt", timestamp, timestamp))
+
+	return true
+}
+
+// writeFailure records a page that could not be processed: lines is written
+// to filename+".err" and the downloaded page to filename+".html". Both files
+// get timestamp as access and modification time.
+func writeFailure(filename string, lines []string, body []byte, timestamp time.Time) {
+	fp, err := os.Create(filename + ".err")
+	p(err)
+	for _, line := range lines {
+		p(fp.WriteString(line))
+	}
+	p(fp.Close())
+	p(os.Chtimes(filename+".err", timestamp, timestamp))
+
+	fp, err = os.Create(filename + ".html")
+	p(err)
+	p(fp.Write(body))
+	p(fp.Close())
+	p(os.Chtimes(filename+".html", timestamp, timestamp))
+}
+
+// addEnd trims s and returns it as one line of text that ends in sentence
+// punctuation. Empty input gives "".
+//
+// A "." is appended unless the text already ends in ".", "!" or "?",
+// optionally followed by one closing quotation mark. The straight quotes
+// " and ' and the curly double quote were already accepted; the curly single
+// quote is now accepted too, so a quotation that ends in a curly single
+// quote no longer gets a second full stop.
+func addEnd(s string) string {
+	s = strings.TrimSpace(s)
+	if s == "" {
+		return ""
+	}
+	for _, mark := range []string{".", "!", "?"} {
+		// The empty entry covers punctuation at the very end.
+		for _, quote := range []string{"", `"`, `'`, `”`, `’`} {
+			if strings.HasSuffix(s, mark+quote) {
+				return s + "\n"
+			}
+		}
+	}
+	return s + ".\n"
+}
+
+// fixSpace returns s with leading and trailing white space removed and every
+// inner run of white space replaced by a single space.
+func fixSpace(s string) string {
+	var b strings.Builder
+	for i, word := range strings.Fields(s) {
+		if i > 0 {
+			b.WriteByte(' ')
+		}
+		b.WriteString(word)
+	}
+	return b.String()
+}
+
+// mkLock takes a lock by creating filename as a symbolic link whose target
+// is "<base name of filename>.<pid>". The link is read back afterwards and
+// a target other than our own is reported as an error. All failures,
+// including a link that already exists, go through p.
+func mkLock(filename string) {
+	want := fmt.Sprintf("%s.%d", filepath.Base(filename), os.Getpid())
+	p(os.Symlink(want, filename))
+
+	got, err := os.Readlink(filename)
+	p(err)
+
+	if got == want {
+		return
+	}
+	p(fmt.Errorf("wrong lock name %q, should be %q", got, want))
+}
diff --git a/Parool/txt2corpus.sh b/Parool/txt2corpus.sh
new file mode 100755
index 0000000..46b0e62
--- /dev/null
+++ b/Parool/txt2corpus.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Parse one ISO week of Parool articles and build the corpus files for it.
+# Usage: txt2corpus.sh [YYYY-WW]
+# Without an argument the ISO week of seven days ago is used.
+# Any failing command aborts the script (set -e).
+set -e
+
+unset CDPATH
+PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+export TZ=Europe/Amsterdam
+. /net/aps/etc/alpino-activate.sh > /dev/null
+
+if [ "$1" = "" ]
+then
+ # %G-%V is ISO year and ISO week number.
+ ds=`date -d -7days +%G-%V`
+else
+ case "$1" in
+ 2[0-9][0-9][0-9]-[0-5][0-9])
+ ds=$1
+ ;;
+ *)
+ echo INVALID
+ exit 1
+ ;;
+ esac
+fi
+
+# YYYY-WW becomes YYYY/WW, the layout of the article directories.
+dp=${ds//-//}
+
+# Prefix of every output file for this week.
+corpus=/net/corpora/nlnieuws/Parool/corpus/$ds
+
+cd /net/corpora/nlnieuws/Parool/$dp
+
+# Take the directory lock: a symlink named "lock" pointing at lock.<pid>.
+# cmd/parool skips week directories that contain a "lock" entry.
+# NOTE(review): the lock is removed only on the last line, so it stays behind
+# when a later command fails; confirm that this is intended.
+ln -s lock.$$ lock
+if [ "`readlink lock`" != lock.$$ ]
+then
+ echo Getting lock failed
+ exit 1
+fi
+
+# Scratch directory for the parser output.
+rm -fr out
+mkdir out
+
+# Build the parser input: strip leading white space, drop ##META lines,
+# tokenize, and prefix each resulting line with "parool.<file>.<line number>|".
+rm -f $corpus.lines
+for i in *.txt
+do
+ b=`basename $i .txt`
+ perl -p -e 's/^\s*//; s/^##META.*\n//' $i | tokenize.sh \
+ | perl -e '$n = 0; while(<>) { $n++; print("parool.'$b'.$n|$_"); }' \
+ >> $corpus.lines
+done
+
+cd out
+mkdir xml
+# Alpino reads the lines file; its diagnostics go to the log file.
+# NOTE(review): presumably it writes one file per sentence into xml/ - confirm.
+Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.lines 2> $corpus.log
+
+# Run the metadata program from the Parool directory, three levels up.
+# It reads the article files in ".." and rewrites the files in xml/.
+../../../metadata
+
+cd xml
+rm -f $corpus.data.dz $corpus.index
+# NOTE(review): presumably packs all xml files into one compact corpus - confirm.
+alto -q -o $corpus.data.dz *.xml
+
+# count per article, not per sentence
+/net/corpora/nlnieuws/namen.sh -x T -s $corpus.data.dz > $corpus.tag.txt
+
+cd ../..
+rm -fr out
+
+rm -f lock
diff --git a/collect.sh b/collect.sh
index a76ef57..fffb9f7 100755
--- a/collect.sh
+++ b/collect.sh
@@ -40,7 +40,7 @@ cd /net/corpora/nlnieuws/data
declare -A parts
#parts[alles]='.'
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso'
-parts[amsterdam]='AT5|BuurtAdam'
+parts[amsterdam]='AT5|BuurtAdam|Parool'
parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
parts[literatuur]='LitNL|Tzum'
parts[vlaanderen]='VRT'
@@ -53,6 +53,7 @@ parts[vlaanderen]='VRT'
#parts[NU]='NU'
#parts[NieuwsNL]='NieuwsNL'
#parts[Oog]='Oog'
+#parts[Parool]='Parool'
#parts[RO]='RO'
#parts[RTVNoord]='RTVNoord'
#parts[Sargasso]='Sargasso'