Files
nlnieuws/Amsterdam/amsterdam.go
Peter Kleiweg 9b65fa8efa update
2026-03-17 12:32:46 +01:00

181 lines
3.7 KiB
Go

package main
import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
"encoding/xml"
"fmt"
"io"
"net/http"
"net/url"
"os"
"strings"
"time"
)
// Rss is the decoding target for the RSS feed: the root <rss> element and
// the <item> elements found under its <channel>.
type Rss struct {
// XMLName carries the "rss" tag, so xml.Unmarshal rejects a document whose
// root element has a different name.
XMLName xml.Name `xml:"rss"`
// Items holds one entry per channel>item element, in document order.
Items []ItemT `xml:"channel>item"`
}
// ItemT is a single <item> of the RSS feed.
type ItemT struct {
// Title is the item's <title>; main passes it to doArticle as the first
// line of the article text.
Title string `xml:"title"`
// PubDate is the raw <pubDate> text; main parses it as RFC 1123 (numeric
// zone first, then named zone).
PubDate string `xml:"pubDate"`
// UnixTime maps a <unixTime> child element. main writes such an element
// into the saved .xml file; nothing in this file reads the field.
UnixTime int64 `xml:"unixTime"`
// Guid is the item's <guid>; main derives the output file name from it.
Guid string `xml:"guid"`
// Link is the item's <link>, the URL that doArticle downloads.
Link string `xml:"link"`
// Data is the raw inner XML of the <item> element, copied unchanged into
// the saved .xml file.
Data []byte `xml:",innerxml"`
}
var (
// p is shorthand for e.PanicErr and is the error check used throughout
// this file. It is called both with a lone error and with the
// (value, error) results of a call.
// NOTE(review): presumably it panics when the error is non-nil and sets
// e.Panicked — confirm against codeberg.org/pebbe/errors.
p = e.PanicErr
// agent is the User-Agent header value sent with every HTTP request.
agent = "AhrefsBot/7.0"
// agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36"
)
// exists reports whether os.Stat succeeds for filename. Any Stat error,
// not only "file does not exist", yields false.
func exists(filename string) bool {
	if _, statErr := os.Stat(filename); statErr != nil {
		return false
	}
	return true
}
// main downloads the amsterdam.nl news RSS feed and archives each item.
//
// For every item it:
//   - parses pubDate as RFC 1123, first with a numeric zone and then with a
//     named zone;
//   - selects the directory /net/corpora/nlnieuws/amsterdam/YYYY/MM from that
//     date and skips the item when the directory contains a file named "lock";
//   - writes <name>.xml, holding a <unixTime> element followed by the item's
//     raw inner XML, where <name> is the path-escaped guid without its
//     "https://www.amsterdam.nl/nieuws/" prefix;
//   - calls doArticle to fetch the linked page.
//
// All errors go through p. If e.Panicked is set when main unwinds, the
// deferred handler swallows the panic and exits with status 1.
func main() {
	defer func() {
		if e.Panicked {
			_ = recover()
			os.Exit(1)
		}
	}()

	const feedURL = "https://www.amsterdam.nl/nieuws/nieuwsoverzicht/?rss=true"

	req, err := http.NewRequest("GET", feedURL, nil)
	p(err)
	req.Header.Set("User-Agent", agent)
	// A zero Timeout means no timeout at all; bound the whole exchange so a
	// stalled server cannot hang the run indefinitely.
	client := &http.Client{Timeout: 2 * time.Minute}
	resp, err := client.Do(req)
	p(err)
	body, err := io.ReadAll(resp.Body)
	p(err)
	p(resp.Body.Close())
	// Report an error page as what it is, instead of letting it surface as
	// an XML decoding failure or an empty item list further down.
	if resp.StatusCode != http.StatusOK {
		p(fmt.Errorf("GET %s: unexpected HTTP status %s", feedURL, resp.Status))
	}

	var rss Rss
	p(xml.Unmarshal(body, &rss))
	if len(rss.Items) == 0 {
		p(fmt.Errorf("len(rss.Items) == 0"))
	}

	for _, item := range rss.Items {
		t, err := time.Parse(time.RFC1123Z, item.PubDate)
		if err != nil {
			t, err = time.Parse(time.RFC1123, item.PubDate)
		}
		p(err)

		dirname := fmt.Sprintf("/net/corpora/nlnieuws/amsterdam/%d/%02d", t.Year(), int(t.Month()))
		if exists(dirname + "/lock") {
			continue
		}
		filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://www.amsterdam.nl/nieuws/"))
		p(os.MkdirAll(dirname, 0777))

		// The closure scopes the deferred cleanup to this one item.
		func() {
			var ok bool
			defer func() {
				if e.Panicked {
					// Identify the item that was being processed when p fired.
					fmt.Fprintln(os.Stderr, "----", filename, "----")
				}
				if !ok {
					// Do not leave a possibly incomplete .xml behind.
					_ = os.Remove(filename + ".xml")
				}
			}()

			fp, err := os.Create(filename + ".xml")
			p(err)
			p(fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n"))
			p(fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix()))
			p(fp.Write(item.Data))
			p(fp.WriteString("</item>\n"))
			p(fp.Close())
			// Stamp the file with the publication time.
			p(os.Chtimes(filename+".xml", t, t))

			doArticle(filename, item.Link, item.Title, t)
			ok = true
		}()
	}
}
// doArticle downloads the article at url and stores its text in
// filename+".txt", one paragraph per line.
//
// The file starts with title, followed by the <p> elements of the
// "inleiding" block inside #zone_intro and the <p>/<h3> children of the
// "tekst" block inside #zone_content; every line is passed through addEnd.
// If neither block yields an element, the downloaded page is also saved as
// filename+".debug.html" for inspection. Written files get timestamp as
// their access and modification time.
//
// The function returns immediately when the .txt file already exists. All
// errors go through p. If it does not run to completion, the .txt file is
// removed again, so the early return above cannot mistake a truncated file
// for a finished article on a later run.
//
// NOTE(review): the HTTP status is not checked here; an error page will
// typically match neither XPath and end up in the debug dump.
func doArticle(filename string, url string, title string, timestamp time.Time) {
	txtName := filename + ".txt"
	if exists(txtName) {
		return
	}

	// Pause before each download — presumably to throttle requests to the site.
	time.Sleep(2 * time.Second)

	req, err := http.NewRequest("GET", url, nil)
	p(err)
	req.Header.Set("User-Agent", agent)
	// A zero Timeout means no timeout at all; bound the whole exchange so a
	// stalled server cannot hang the run indefinitely.
	client := &http.Client{Timeout: 2 * time.Minute}
	resp, err := client.Do(req)
	p(err)
	body, err := io.ReadAll(resp.Body)
	p(err)
	p(resp.Body.Close())

	doc, err := gokogiri.ParseHtml(body)
	p(err)
	// gokogiri documents wrap libxml2 memory that Go's garbage collector
	// does not release; it has to be freed explicitly.
	defer doc.Free()
	root := doc.Root()

	fp, err := os.Create(txtName)
	p(err)
	complete := false
	defer func() {
		if !complete {
			_ = os.Remove(txtName)
		}
	}()

	p(fp.WriteString(addEnd(title)))
	count := 0
	// NOTE(review): Root() presumably returns nil for a document without a
	// root element (e.g. an empty body) — confirm against gokogiri. Such a
	// page then falls through to the count == 0 dump instead of a nil
	// dereference.
	if root != nil {
		pp, err := root.Search(`//div[@id="zone_intro"]//div[contains(@class, "inleiding")]/p`)
		p(err)
		for _, p1 := range pp {
			p(fp.WriteString(addEnd(p1.Content())))
			count++
		}
		ell, err := root.Search(`//div[@id="zone_content"]//div[contains(@class, "tekst")]/child::*`)
		p(err)
		for _, el := range ell {
			if n := el.Name(); n == "p" || n == "h3" {
				p(fp.WriteString(addEnd(el.Content())))
				count++
			}
		}
	}
	p(fp.Close())
	p(os.Chtimes(txtName, timestamp, timestamp))

	if count == 0 {
		// Nothing matched: keep the raw page so the selectors can be checked.
		dbgName := filename + ".debug.html"
		fp, err := os.Create(dbgName)
		p(err)
		p(fp.Write(body))
		p(fp.Close())
		p(os.Chtimes(dbgName, timestamp, timestamp))
	}
	complete = true
}
// addEnd trims surrounding whitespace from s and returns it as one
// newline-terminated line that ends in sentence punctuation.
//
// Blank input yields "". Text that already ends in '.', '!' or '?' —
// optionally followed by a single ASCII quote character (" or ') — only
// gets the newline; anything else gets ".\n" appended.
func addEnd(s string) string {
	text := strings.TrimSpace(s)
	if text == "" {
		return ""
	}

	last := len(text) - 1
	switch text[last] {
	case '.', '!', '?':
		return text + "\n"
	case '"', '\'':
		// A closing quote counts only when punctuation sits right before it.
		if last > 0 {
			switch text[last-1] {
			case '.', '!', '?':
				return text + "\n"
			}
		}
	}
	return text + ".\n"
}