ISODate -> ISOWeek; exists(dirname + "/lock")

This commit is contained in:
Peter Kleiweg
2026-03-02 15:42:01 +01:00
parent 36f051a8a9
commit a94b190108
23 changed files with 260 additions and 37 deletions

54
.gitignore vendored
View File

@@ -1,29 +1,29 @@
/Amsterdam Amsterdam/amsterdam
/AT5/at5 AT5/at5
/AT5/metadata AT5/metadata
/AT5/xml2txt AT5/xml2txt
/GG/gg GG/gg
/GG/metadata GG/metadata
/NieuwsNL/metadata NieuwsNL/metadata
/NieuwsNL/nieuwsnl NieuwsNL/nieuwsnl
/NOS/json2txt NOS/json2txt
/NOS/metadata NOS/metadata
/NOS/nos NOS/nos
/NU/metadata NU/metadata
/NU/nu NU/nu
/RO/metadata RO/metadata
/RO/ro RO/ro
/RO/xml2txt RO/xml2txt
/Sargasso/metadata Sargasso/metadata
/Sargasso/sargasso Sargasso/sargasso
/Sargasso/xml2txt Sargasso/xml2txt
/Sikkom/metadata Sikkom/metadata
/Sikkom/sikkom Sikkom/sikkom
/Tzum/metadata Tzum/metadata
/Tzum/tzum Tzum/tzum
/Tzum/xml2txt Tzum/xml2txt
/VRT/metadata VRT/metadata
/VRT/vrt VRT/vrt
/bin/ISOWeek bin/ISOWeek
20?? 20??
corpus corpus

View File

@@ -30,6 +30,11 @@ var (
agent = "AhrefsBot/7.0" agent = "AhrefsBot/7.0"
) )
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "file does not exist", yields false.
func exists(filename string) bool {
	if _, statErr := os.Stat(filename); statErr != nil {
		return false
	}
	return true
}
func main() { func main() {
req, err := http.NewRequest("GET", "https://rss.at5.nl/rss", nil) req, err := http.NewRequest("GET", "https://rss.at5.nl/rss", nil)
x(err) x(err)
@@ -57,6 +62,9 @@ func main() {
x(err) x(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/AT5/%d/%02d", year, week)
if exists(dirname + "/lock") {
continue
}
basename := strings.TrimPrefix(item.Guid, "https://www.at5.nl/artikelen/") basename := strings.TrimPrefix(item.Guid, "https://www.at5.nl/artikelen/")
if i := strings.LastIndex(basename, "/"); i > 0 { if i := strings.LastIndex(basename, "/"); i > 0 {
basename = basename[:i] basename = basename[:i]

View File

@@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`ISODate -7` ds=`ISOWeek -7`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[0-5][0-9])

4
Amsterdam/Makefile Normal file
View File

@@ -0,0 +1,4 @@
# Default target: build the amsterdam program.
all: amsterdam
# Pattern rule: build a target from the Go source file with the same base name.
% : %.go
go build $<

169
Amsterdam/amsterdam.go Normal file
View File

@@ -0,0 +1,169 @@
package main
import (
"github.com/jbowtie/gokogiri"
"github.com/pebbe/util"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
"time"
)
// Rss is the decoding target for the RSS document that main downloads.
type Rss struct {
XMLName xml.Name `xml:"rss"`
// Items collects every <item> element found under <channel>.
Items []ItemT `xml:"channel>item"`
}
// ItemT holds one <item> element of the feed.
type ItemT struct {
// Title is passed to doArticle, which writes it as the first line of the text file.
Title string `xml:"title"`
// PubDate is parsed in main as RFC1123Z, falling back to RFC1123.
PubDate string `xml:"pubDate"`
// UnixTime is not read anywhere in this file; main writes a <unixTime>
// element into the saved item XML, which this field would decode.
UnixTime int64 `xml:"unixTime"`
// Guid is used in main to derive the output file name.
Guid string `xml:"guid"`
// Link is the article URL that doArticle fetches.
Link string `xml:"link"`
// Data is the raw inner XML of the item; main copies it verbatim into the saved .xml file.
Data []byte `xml:",innerxml"`
}
var (
// x is shorthand for util.CheckErr; every error in this file is passed to it.
// NOTE(review): presumably it reports a non-nil error and aborts the program —
// confirm against github.com/pebbe/util.
x = util.CheckErr
// agent is the value sent in the User-Agent header of every HTTP request.
agent = "AhrefsBot/7.0"
// Alternative browser-style User-Agent, currently disabled:
// agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36"
)
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "file does not exist", yields false.
func exists(filename string) bool {
	if _, statErr := os.Stat(filename); statErr != nil {
		return false
	}
	return true
}
// main downloads the amsterdam.nl news RSS feed and archives every item.
//
// For each item it derives a directory
// /net/corpora/nlnieuws/amsterdam/<year>/<month> from the publication date,
// skips the item when that directory contains a "lock" file, writes the
// item's XML (prefixed with a <unixTime> element) to <guid>.xml, and then
// calls doArticle to fetch the article text. Every error is handed to x.
func main() {
	req, err := http.NewRequest("GET", "https://www.amsterdam.nl/nieuws/nieuwsoverzicht/?rss=true", nil)
	x(err)
	req.Header.Set("User-Agent", agent)

	// Bound the whole exchange so a stalled server cannot hang the run forever.
	client := &http.Client{Timeout: 60 * time.Second}
	resp, err := client.Do(req)
	x(err)
	body, err := io.ReadAll(resp.Body)
	x(err)
	x(resp.Body.Close())
	// Report an HTTP failure as such instead of as a confusing XML error.
	if resp.StatusCode != http.StatusOK {
		x(fmt.Errorf("fetching feed: unexpected status %s", resp.Status))
	}

	var rss Rss
	x(xml.Unmarshal(body, &rss))

	if len(rss.Items) == 0 {
		x(fmt.Errorf("len(rss.Items) == 0"))
	}

	for _, item := range rss.Items {
		// The feed date is tried with a numeric zone first, then a named zone.
		t, err := time.Parse(time.RFC1123Z, item.PubDate)
		if err != nil {
			t, err = time.Parse(time.RFC1123, item.PubDate)
		}
		x(err)

		// This source is archived per calendar month.
		dirname := fmt.Sprintf("/net/corpora/nlnieuws/amsterdam/%d/%02d", t.Year(), int(t.Month()))
		if exists(dirname + "/lock") {
			continue
		}
		filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://www.amsterdam.nl/nieuws/"))

		x(os.MkdirAll(dirname, 0777))
		fp, err := os.Create(filename + ".xml")
		x(err)
		_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
		x(err)
		_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
		x(err)
		_, err = fp.Write(item.Data)
		x(err)
		_, err = fp.WriteString("</item>\n")
		x(err)
		x(fp.Close())
		// Give the file the publication time as access and modification time.
		x(os.Chtimes(filename+".xml", t, t))

		doArticle(filename, item.Link, item.Title, t)
	}
}
// doArticle fetches the article at url and writes its text to filename+".txt".
//
// The text file starts with title, followed by the intro paragraphs and the
// <p>/<h3> children of the content zone, each passed through addEnd. If
// filename+".txt" already exists, nothing is done. When no paragraph was
// found, the raw HTML is saved as filename+".debug.html" for inspection.
// All files written get timestamp as access and modification time. Every
// error is handed to x.
func doArticle(filename string, url string, title string, timestamp time.Time) {
	if exists(filename + ".txt") {
		return
	}

	// Pause before each download to avoid hammering the server.
	time.Sleep(2 * time.Second)

	req, err := http.NewRequest("GET", url, nil)
	x(err)
	req.Header.Set("User-Agent", agent)
	// Bound the whole exchange so a stalled server cannot hang the run forever.
	client := &http.Client{Timeout: 60 * time.Second}
	resp, err := client.Do(req)
	x(err)
	body, err := io.ReadAll(resp.Body)
	x(err)
	x(resp.Body.Close())

	doc, err := gokogiri.ParseHtml(body)
	x(err)
	// The parsed document lives in C memory and must be released explicitly.
	defer doc.Free()
	root := doc.Root()

	fp, err := os.Create(filename + ".txt")
	x(err)
	_, err = fp.WriteString(addEnd(title))
	x(err)

	// count tracks how many text elements were found, whether or not they were empty.
	count := 0

	pp, err := root.Search(`//div[@id="zone_intro"]//div[contains(@class, "inleiding")]/p`)
	x(err)
	for _, p := range pp {
		_, err = fp.WriteString(addEnd(p.Content()))
		x(err)
		count++
	}

	ell, err := root.Search(`//div[@id="zone_content"]//div[contains(@class, "tekst")]/child::*`)
	x(err)
	for _, el := range ell {
		if n := el.Name(); n == "p" || n == "h3" {
			_, err = fp.WriteString(addEnd(el.Content()))
			x(err)
			count++
		}
	}

	x(fp.Close())
	x(os.Chtimes(filename+".txt", timestamp, timestamp))

	// Nothing matched: keep the raw page so the selectors can be debugged.
	if count == 0 {
		fp, err := os.Create(filename + ".debug.html")
		x(err)
		_, err = fp.Write(body)
		x(err)
		x(fp.Close())
		x(os.Chtimes(filename+".debug.html", timestamp, timestamp))
	}
}
// addEnd normalises one line of text for output. Surrounding whitespace is
// trimmed; an empty result stays empty. Otherwise the line is returned with
// a trailing newline, and a "." is appended first unless the line already
// ends in '.', '!' or '?', optionally followed by a straight single or
// double quote.
func addEnd(s string) string {
	line := strings.TrimSpace(s)
	if line == "" {
		return ""
	}

	end := len(line) - 1
	// A closing straight quote may follow the terminator; look past it.
	if end > 0 && (line[end] == '"' || line[end] == '\'') {
		end--
	}

	switch line[end] {
	case '.', '!', '?':
		return line + "\n"
	}
	return line + ".\n"
}

View File

@@ -78,6 +78,9 @@ func main() {
x(err) x(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/GG/%d/%02d", year, week)
if exists(dirname + "/lock") {
continue
}
filename := dirname + "/" + url.PathEscape(item.Guid) filename := dirname + "/" + url.PathEscape(item.Guid)
ts := fmt.Sprintf("%d", t.Unix()) ts := fmt.Sprintf("%d", t.Unix())

View File

@@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`ISODate -7` ds=`ISOWeek -7`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[0-5][0-9])

View File

@@ -77,6 +77,9 @@ func main() {
x(err) x(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/NOS/%d/%02d", year, week)
if exists(dirname + "/lock") {
continue
}
filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://nos.nl/l/")) filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://nos.nl/l/"))
ts := fmt.Sprintf("%d", t.Unix()) ts := fmt.Sprintf("%d", t.Unix())

View File

@@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`ISODate -7` ds=`ISOWeek -7`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[0-5][0-9])

View File

@@ -82,6 +82,9 @@ func main() {
x(err) x(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/NU/%d/%02d", year, week)
if exists(dirname + "/lock") {
continue
}
filename := dirname + "/" + url.PathEscape(item.Guid) filename := dirname + "/" + url.PathEscape(item.Guid)
ts := fmt.Sprintf("%d", t.Unix()) ts := fmt.Sprintf("%d", t.Unix())

View File

@@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`ISODate -7` ds=`ISOWeek -7`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[0-5][0-9])

View File

@@ -77,6 +77,9 @@ func main() {
} }
x(err) x(err)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/NieuwsNL/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day()) dirname := fmt.Sprintf("/net/corpora/nlnieuws/NieuwsNL/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
if exists(dirname + "/lock") {
continue
}
filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "urn:uuid:")) filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "urn:uuid:"))
ts := fmt.Sprintf("%d", t.Unix()) ts := fmt.Sprintf("%d", t.Unix())

View File

@@ -11,7 +11,7 @@ if [ "$1" = "" ]
then then
# nieuws.nl gaat per dag, niet per week # nieuws.nl gaat per dag, niet per week
# dus gegevens van 2 dagen geleden, niet een week geleden # dus gegevens van 2 dagen geleden, niet een week geleden
ds=`ISODate -2` ds=`date -d -2days +%Y-%m-%d`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9]) 2[0-9][0-9][0-9]-[01][0-9]-[0-3][0-9])

View File

@@ -30,6 +30,11 @@ var (
agent = "AhrefsBot/7.0" agent = "AhrefsBot/7.0"
) )
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "file does not exist", yields false.
func exists(filename string) bool {
	if _, statErr := os.Stat(filename); statErr != nil {
		return false
	}
	return true
}
func main() { func main() {
req, err := http.NewRequest("GET", "https://reportersonline.nl/feed/", nil) req, err := http.NewRequest("GET", "https://reportersonline.nl/feed/", nil)
x(err) x(err)
@@ -57,6 +62,9 @@ func main() {
x(err) x(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/RO/%d/%02d", year, week)
if exists(dirname + "/lock") {
continue
}
basename := strings.TrimPrefix(item.Guid, "https://reportersonline.nl/?p=") basename := strings.TrimPrefix(item.Guid, "https://reportersonline.nl/?p=")
if i := strings.LastIndex(basename, "/"); i > 0 { if i := strings.LastIndex(basename, "/"); i > 0 {
basename = basename[:i] basename = basename[:i]

View File

@@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`ISODate -7` ds=`ISOWeek -7`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[0-5][0-9])

View File

@@ -30,6 +30,11 @@ var (
agent = "AhrefsBot/7.0" agent = "AhrefsBot/7.0"
) )
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "file does not exist", yields false.
func exists(filename string) bool {
	if _, statErr := os.Stat(filename); statErr != nil {
		return false
	}
	return true
}
func main() { func main() {
req, err := http.NewRequest("GET", "https://sargasso.nl/feed/", nil) req, err := http.NewRequest("GET", "https://sargasso.nl/feed/", nil)
x(err) x(err)
@@ -57,6 +62,9 @@ func main() {
x(err) x(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sargasso/%d/%02d", year, week)
if exists(dirname + "/lock") {
continue
}
basename := strings.TrimPrefix(item.Guid, "https://sargasso.nl/?") basename := strings.TrimPrefix(item.Guid, "https://sargasso.nl/?")
if i := strings.LastIndex(basename, "p="); i >= 0 { if i := strings.LastIndex(basename, "p="); i >= 0 {
basename = basename[i+2:] basename = basename[i+2:]

View File

@@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`ISODate -7` ds=`ISOWeek -7`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[0-5][0-9])

View File

@@ -72,6 +72,9 @@ func main() {
x(err) x(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/Sikkom/%d/%02d", year, week)
if exists(dirname + "/lock") {
continue
}
filename := dirname + "/" + url.PathEscape(item.Guid) filename := dirname + "/" + url.PathEscape(item.Guid)
ts := fmt.Sprintf("%d", t.Unix()) ts := fmt.Sprintf("%d", t.Unix())

View File

@@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`ISODate -7` ds=`ISOWeek -7`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[0-5][0-9])

View File

@@ -30,6 +30,11 @@ var (
agent = "AhrefsBot/7.0" agent = "AhrefsBot/7.0"
) )
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "file does not exist", yields false.
func exists(filename string) bool {
	if _, statErr := os.Stat(filename); statErr != nil {
		return false
	}
	return true
}
func main() { func main() {
req, err := http.NewRequest("GET", "https://www.tzum.info/feed/", nil) req, err := http.NewRequest("GET", "https://www.tzum.info/feed/", nil)
x(err) x(err)
@@ -57,6 +62,9 @@ func main() {
x(err) x(err)
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/Tzum/%d/%02d", year, week)
if exists(dirname + "/lock") {
continue
}
basename := strings.TrimPrefix(item.Guid, "https://www.tzum.info/?p=") basename := strings.TrimPrefix(item.Guid, "https://www.tzum.info/?p=")
if i := strings.LastIndex(basename, "/"); i > 0 { if i := strings.LastIndex(basename, "/"); i > 0 {
basename = basename[:i] basename = basename[:i]

View File

@@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`ISODate -7` ds=`ISOWeek -7`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[0-5][0-9])

View File

@@ -94,6 +94,9 @@ func main() {
} }
year, week := t.ISOWeek() year, week := t.ISOWeek()
dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week) dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d", year, week)
if exists(dirname + "/lock") {
continue
}
filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.ID, "https://vrtnws.be/")) filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.ID, "https://vrtnws.be/"))
ts := fmt.Sprintf("%d", t.Unix()) ts := fmt.Sprintf("%d", t.Unix())

View File

@@ -9,7 +9,7 @@ export TZ=Europe/Amsterdam
if [ "$1" = "" ] if [ "$1" = "" ]
then then
ds=`ISODate -7` ds=`ISOWeek -7`
else else
case "$1" in case "$1" in
2[0-9][0-9][0-9]-[0-5][0-9]) 2[0-9][0-9][0-9]-[0-5][0-9])