From a94b190108c45ee6459556f41b57adada8de4341 Mon Sep 17 00:00:00 2001 From: Peter Kleiweg Date: Mon, 2 Mar 2026 15:42:01 +0100 Subject: [PATCH] ISODate -> ISOWeek; exists(dirname + "/lock") --- .gitignore | 54 +++++----- AT5/cmd/at5/at5.go | 8 ++ AT5/txt2corpus.sh | 2 +- Amsterdam/Makefile | 4 + Amsterdam/amsterdam.go | 169 ++++++++++++++++++++++++++++++ GG/cmd/gg/gg.go | 3 + GG/txt2corpus.sh | 2 +- NOS/cmd/nos/nos.go | 3 + NOS/txt2corpus.sh | 2 +- NU/cmd/nu/nu.go | 3 + NU/txt2corpus.sh | 2 +- NieuwsNL/cmd/nieuwsnl/nieuwsnl.go | 3 + NieuwsNL/txt2corpus.sh | 2 +- RO/cmd/ro/ro.go | 8 ++ RO/txt2corpus.sh | 2 +- Sargasso/cmd/sargasso/sargasso.go | 8 ++ Sargasso/txt2corpus.sh | 2 +- Sikkom/cmd/sikkom/sikkom.go | 3 + Sikkom/txt2corpus.sh | 2 +- Tzum/cmd/tzum/tzum.go | 8 ++ Tzum/txt2corpus.sh | 2 +- VRT/cmd/vrt/vrt.go | 3 + VRT/txt2corpus.sh | 2 +- 23 files changed, 260 insertions(+), 37 deletions(-) create mode 100644 Amsterdam/Makefile create mode 100644 Amsterdam/amsterdam.go diff --git a/.gitignore b/.gitignore index 16a96c1..4cb20fc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,29 +1,29 @@ -/Amsterdam -/AT5/at5 -/AT5/metadata -/AT5/xml2txt -/GG/gg -/GG/metadata -/NieuwsNL/metadata -/NieuwsNL/nieuwsnl -/NOS/json2txt -/NOS/metadata -/NOS/nos -/NU/metadata -/NU/nu -/RO/metadata -/RO/ro -/RO/xml2txt -/Sargasso/metadata -/Sargasso/sargasso -/Sargasso/xml2txt -/Sikkom/metadata -/Sikkom/sikkom -/Tzum/metadata -/Tzum/tzum -/Tzum/xml2txt -/VRT/metadata -/VRT/vrt -/bin/ISOWeek +Amsterdam/amsterdam +AT5/at5 +AT5/metadata +AT5/xml2txt +GG/gg +GG/metadata +NieuwsNL/metadata +NieuwsNL/nieuwsnl +NOS/json2txt +NOS/metadata +NOS/nos +NU/metadata +NU/nu +RO/metadata +RO/ro +RO/xml2txt +Sargasso/metadata +Sargasso/sargasso +Sargasso/xml2txt +Sikkom/metadata +Sikkom/sikkom +Tzum/metadata +Tzum/tzum +Tzum/xml2txt +VRT/metadata +VRT/vrt +bin/ISOWeek 20?? corpus diff --git a/AT5/cmd/at5/at5.go b/AT5/cmd/at5/at5.go index 841cdc7..ca9ba6b 100644 --- a/AT5/cmd/at5/at5.go +++ b/AT5/cmd/at5/at5.go @@ -30,6 +30,11 @@ var ( agent = "AhrefsBot/7.0" ) +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + func main() { req, err := http.NewRequest("GET", "https://rss.at5.nl/rss", nil) x(err) @@ -57,6 +62,9 @@ func main() { x(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week) + if exists(dirname + "/lock") { + continue + } basename := strings.TrimPrefix(item.Guid, "https://www.at5.nl/artikelen/") if i := strings.LastIndex(basename, "/"); i > 0 { basename = basename[:i] diff --git a/AT5/txt2corpus.sh b/AT5/txt2corpus.sh index f0662fb..5bdbff2 100755 --- a/AT5/txt2corpus.sh +++ b/AT5/txt2corpus.sh @@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam if [ "$1" = "" ] then - ds=`ISODate -7` + ds=`ISOWeek -7` else case "$1" in 2[0-9][0-9][0-9]-[0-5][0-9]) diff --git a/Amsterdam/Makefile b/Amsterdam/Makefile new file mode 100644 index 0000000..ac0dfb6 --- /dev/null +++ b/Amsterdam/Makefile @@ -0,0 +1,4 @@ +all: amsterdam + +% : %.go + go build $< diff --git a/Amsterdam/amsterdam.go b/Amsterdam/amsterdam.go new file mode 100644 index 0000000..8a70075 --- /dev/null +++ b/Amsterdam/amsterdam.go @@ -0,0 +1,169 @@ +package main + +import ( + "github.com/jbowtie/gokogiri" + "github.com/pebbe/util" + + "encoding/xml" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +type Rss struct { + XMLName xml.Name `xml:"rss"` + Items []ItemT `xml:"channel>item"` +} + +type ItemT struct { + Title string `xml:"title"` + PubDate string `xml:"pubDate"` + UnixTime int64 `xml:"unixTime"` + Guid string `xml:"guid"` + Link string `xml:"link"` + Data []byte `xml:",innerxml"` +} + +var ( + x = util.CheckErr + agent = "AhrefsBot/7.0" + // agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36" +) + +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + +func main() { + req, err := http.NewRequest("GET", "https://www.amsterdam.nl/nieuws/nieuwsoverzicht/?rss=true", nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + var rss Rss + x(xml.Unmarshal(body, &rss)) + + if len(rss.Items) == 0 { + x(fmt.Errorf("len(rss.Items) == 0")) + } + + for _, item := range rss.Items { + t, err := time.Parse(time.RFC1123Z, item.PubDate) + if err != nil { + t, err = time.Parse(time.RFC1123, item.PubDate) + } + x(err) + dirname := fmt.Sprintf("/net/corpora/nlnieuws/amsterdam/%d/%02d", t.Year(), int(t.Month())) + if exists(dirname + "/lock") { + continue + } + filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://www.amsterdam.nl/nieuws/")) + x(os.MkdirAll(dirname, 0777)) + fp, err := os.Create(filename + ".xml") + x(err) + _, err = fp.WriteString("\n\n") + x(err) + _, err = fmt.Fprintf(fp, "%d", t.Unix()) + x(err) + _, err = fp.Write(item.Data) + x(err) + _, err = fp.WriteString("\n") + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".xml", t, t)) + doArticle(filename, item.Link, item.Title, t) + } +} + +func doArticle(filename string, url string, title string, timestamp time.Time) { + if exists(filename + ".txt") { + return + } + time.Sleep(2 * time.Second) + + req, err := http.NewRequest("GET", url, nil) + x(err) + req.Header.Set("User-Agent", agent) + + client := &http.Client{} + resp, err := client.Do(req) + x(err) + body, err := io.ReadAll(resp.Body) + x(err) + x(resp.Body.Close()) + + doc, err := gokogiri.ParseHtml(body) + x(err) + + root := doc.Root() + + fp, err := os.Create(filename + ".txt") + x(err) + + _, err = fp.WriteString(addEnd(title)) + x(err) + + count := 0 + + pp, err := root.Search(`//div[@id="zone_intro"]//div[contains(@class, "inleiding")]/p`) + x(err) + for _, p := range pp { + _, err = fp.WriteString(addEnd(p.Content())) + x(err) + count++ + } + + ell, err := root.Search(`//div[@id="zone_content"]//div[contains(@class, "tekst")]/child::*`) + x(err) + for _, el := range ell { + if n := el.Name(); n == "p" || n == "h3" { + _, err = fp.WriteString(addEnd(el.Content())) + count++ + x(err) + } + } + + x(fp.Close()) + + x(os.Chtimes(filename+".txt", timestamp, timestamp)) + + if count == 0 { + fp, err := os.Create(filename + ".debug.html") + x(err) + _, err = fp.Write(body) + x(err) + x(fp.Close()) + x(os.Chtimes(filename+".debug.html", timestamp, timestamp)) + } +} + +func addEnd(s string) string { + s = strings.TrimSpace(s) + n := len(s) + if n == 0 { + return "" + } + if n > 0 { + if strings.ContainsAny(s[n-1:], ".!?") { + return s + "\n" + } + } + if n > 1 { + s2 := s[n-2:] + if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { + return s + "\n" + } + } + return s + ".\n" +} diff --git a/GG/cmd/gg/gg.go b/GG/cmd/gg/gg.go index f0bf530..96869d9 100644 --- a/GG/cmd/gg/gg.go +++ b/GG/cmd/gg/gg.go @@ -78,6 +78,9 @@ func main() { x(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week) + if exists(dirname + "/lock") { + continue + } filename := dirname + "/" + url.PathEscape(item.Guid) ts := fmt.Sprintf("%d", t.Unix()) diff --git a/GG/txt2corpus.sh b/GG/txt2corpus.sh index 354c18d..628d432 100755 --- a/GG/txt2corpus.sh +++ b/GG/txt2corpus.sh @@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam if [ "$1" = "" ] then - ds=`ISODate -7` + ds=`ISOWeek -7` else case "$1" in 2[0-9][0-9][0-9]-[0-5][0-9]) diff --git a/NOS/cmd/nos/nos.go b/NOS/cmd/nos/nos.go index 87cd7be..9707638 100644 --- a/NOS/cmd/nos/nos.go +++ b/NOS/cmd/nos/nos.go @@ -77,6 +77,9 @@ func main() { x(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week) + if exists(dirname + "/lock") { + continue + } filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://nos.nl/l/")) ts := fmt.Sprintf("%d", t.Unix()) diff --git a/NOS/txt2corpus.sh b/NOS/txt2corpus.sh index 1735991..4f29212 100755 --- a/NOS/txt2corpus.sh +++ b/NOS/txt2corpus.sh @@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam if [ "$1" = "" ] then - ds=`ISODate -7` + ds=`ISOWeek -7` else case "$1" in 2[0-9][0-9][0-9]-[0-5][0-9]) diff --git a/NU/cmd/nu/nu.go b/NU/cmd/nu/nu.go index 956ba7c..2749a07 100644 --- a/NU/cmd/nu/nu.go +++ b/NU/cmd/nu/nu.go @@ -82,6 +82,9 @@ func main() { x(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week) + if exists(dirname + "/lock") { + continue + } filename := dirname + "/" + url.PathEscape(item.Guid) ts := fmt.Sprintf("%d", t.Unix()) diff --git a/NU/txt2corpus.sh b/NU/txt2corpus.sh index eec98e0..4c18246 100755 --- a/NU/txt2corpus.sh +++ b/NU/txt2corpus.sh @@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam if [ "$1" = "" ] then - ds=`ISODate -7` + ds=`ISOWeek -7` else case "$1" in 2[0-9][0-9][0-9]-[0-5][0-9]) diff --git a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go index c8d986e..f1cda3d 100644 --- a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go +++ b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go @@ -77,6 +77,9 @@ func main() { } x(err) dirname := fmt.Sprintf("/net/corpora/nlnieuws/NieuwsNL/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day()) + if exists(dirname + "/lock") { + continue + } filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "urn:uuid:")) ts := fmt.Sprintf("%d", t.Unix()) diff --git a/NieuwsNL/txt2corpus.sh b/NieuwsNL/txt2corpus.sh index d794a73..6a93640 100755 --- a/NieuwsNL/txt2corpus.sh +++ b/NieuwsNL/txt2corpus.sh @@ -11,7 +11,7 @@ if [ "$1" = "" ] then # nieuws.nl gaat per dag, niet per week # dus gegevens van 2 dagen geleden, niet een week geleden - ds=`ISODate -2` + ds=`date -d -2days +%Y-%m-%d` else case "$1" in 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]) diff --git a/RO/cmd/ro/ro.go b/RO/cmd/ro/ro.go index 1a45943..045531c 100644 --- a/RO/cmd/ro/ro.go +++ b/RO/cmd/ro/ro.go @@ -30,6 +30,11 @@ var ( agent = "AhrefsBot/7.0" ) +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + func main() { req, err := http.NewRequest("GET", "https://reportersonline.nl/feed/", nil) x(err) @@ -57,6 +62,9 @@ func main() { x(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week) + if exists(dirname + "/lock") { + continue + } basename := strings.TrimPrefix(item.Guid, "https://reportersonline.nl/?p=") if i := strings.LastIndex(basename, "/"); i > 0 { basename = basename[:i] diff --git a/RO/txt2corpus.sh b/RO/txt2corpus.sh index b7f7e20..a2017e4 100755 --- a/RO/txt2corpus.sh +++ b/RO/txt2corpus.sh @@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam if [ "$1" = "" ] then - ds=`ISODate -7` + ds=`ISOWeek -7` else case "$1" in 2[0-9][0-9][0-9]-[0-5][0-9]) diff --git a/Sargasso/cmd/sargasso/sargasso.go b/Sargasso/cmd/sargasso/sargasso.go index 8110fb6..1428972 100644 --- a/Sargasso/cmd/sargasso/sargasso.go +++ b/Sargasso/cmd/sargasso/sargasso.go @@ -30,6 +30,11 @@ var ( agent = "AhrefsBot/7.0" ) +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + func main() { req, err := http.NewRequest("GET", "https://sargasso.nl/feed/", nil) x(err) @@ -57,6 +62,9 @@ func main() { x(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week) + if exists(dirname + "/lock") { + continue + } basename := strings.TrimPrefix(item.Guid, "https://sargasso.nl/?") if i := strings.LastIndex(basename, "p="); i >= 0 { basename = basename[i+2:] diff --git a/Sargasso/txt2corpus.sh b/Sargasso/txt2corpus.sh index d656205..ae709e6 100755 --- a/Sargasso/txt2corpus.sh +++ b/Sargasso/txt2corpus.sh @@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam if [ "$1" = "" ] then - ds=`ISODate -7` + ds=`ISOWeek -7` else case "$1" in 2[0-9][0-9][0-9]-[0-5][0-9]) diff --git a/Sikkom/cmd/sikkom/sikkom.go b/Sikkom/cmd/sikkom/sikkom.go index 63b3586..47e8fbf 100644 --- a/Sikkom/cmd/sikkom/sikkom.go +++ b/Sikkom/cmd/sikkom/sikkom.go @@ -72,6 +72,9 @@ func main() { x(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week) + if exists(dirname + "/lock") { + continue + } filename := dirname + "/" + url.PathEscape(item.Guid) ts := fmt.Sprintf("%d", t.Unix()) diff --git a/Sikkom/txt2corpus.sh b/Sikkom/txt2corpus.sh index a511f18..342fda9 100755 --- a/Sikkom/txt2corpus.sh +++ b/Sikkom/txt2corpus.sh @@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam if [ "$1" = "" ] then - ds=`ISODate -7` + ds=`ISOWeek -7` else case "$1" in 2[0-9][0-9][0-9]-[0-5][0-9]) diff --git a/Tzum/cmd/tzum/tzum.go b/Tzum/cmd/tzum/tzum.go index 6fe95ff..ed7acc7 100644 --- a/Tzum/cmd/tzum/tzum.go +++ b/Tzum/cmd/tzum/tzum.go @@ -30,6 +30,11 @@ var ( agent = "AhrefsBot/7.0" ) +func exists(filename string) bool { + _, err := os.Stat(filename) + return err == nil +} + func main() { req, err := http.NewRequest("GET", "https://www.tzum.info/feed/", nil) x(err) @@ -57,6 +62,9 @@ func main() { x(err) year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week) + if exists(dirname + "/lock") { + continue + } basename := strings.TrimPrefix(item.Guid, "https://www.tzum.info/?p=") if i := strings.LastIndex(basename, "/"); i > 0 { basename = basename[:i] diff --git a/Tzum/txt2corpus.sh b/Tzum/txt2corpus.sh index b078931..f5840d3 100755 --- a/Tzum/txt2corpus.sh +++ b/Tzum/txt2corpus.sh @@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam if [ "$1" = "" ] then - ds=`ISODate -7` + ds=`ISOWeek -7` else case "$1" in 2[0-9][0-9][0-9]-[0-5][0-9]) diff --git a/VRT/cmd/vrt/vrt.go b/VRT/cmd/vrt/vrt.go index 3224eca..f59ce97 100644 --- a/VRT/cmd/vrt/vrt.go +++ b/VRT/cmd/vrt/vrt.go @@ -94,6 +94,9 @@ func main() { } year, week := t.ISOWeek() dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week) + if exists(dirname + "/lock") { + continue + } filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.ID, "https://vrtnws.be/")) ts := fmt.Sprintf("%d", t.Unix()) diff --git a/VRT/txt2corpus.sh b/VRT/txt2corpus.sh index ef99eea..37a2969 100755 --- a/VRT/txt2corpus.sh +++ b/VRT/txt2corpus.sh @@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam if [ "$1" = "" ] then - ds=`ISODate -7` + ds=`ISOWeek -7` else case "$1" in 2[0-9][0-9][0-9]-[0-5][0-9])