codeberg.org/pebbe/errors

This commit is contained in:
Peter Kleiweg
2026-03-16 13:41:03 +01:00
parent 7b18c51567
commit 78dc580c8d
40 changed files with 706 additions and 692 deletions

View File

@@ -1,8 +1,8 @@
package main
import (
e "codeberg.org/pebbe/errors"
"github.com/jbowtie/gokogiri"
"github.com/pebbe/util"
"encoding/xml"
"fmt"
@@ -29,7 +29,7 @@ type ItemT struct {
}
// Package-level helpers and settings shared by the functions in this file.
var (
// x is an alias for util.CheckErr; calls in this file pass it an error value.
// NOTE(review): in this diff view every x(...) call is paired with a p(...)
// call, so x is presumably the outgoing helper — confirm against the
// committed file.
x = util.CheckErr
// p is an alias for e.PanicErr (e is the import alias for
// codeberg.org/pebbe/errors). Calls in this file hand it either an error
// or the full result list of a write call. Presumably it panics on a
// non-nil error, given the name and the e.Panicked()/recover() pair in
// main — verify against the package.
p = e.PanicErr
// agent is the value set as the "User-Agent" header on outgoing HTTP requests.
agent = "AhrefsBot/7.0"
// Alternative desktop-browser User-Agent string, kept commented out.
// agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36"
)
@@ -40,22 +40,29 @@ func exists(filename string) bool {
}
func main() {
defer func() {
if e.Panicked() {
_ = recover()
os.Exit(1)
}
}()
req, err := http.NewRequest("GET", "https://www.amsterdam.nl/nieuws/nieuwsoverzicht/?rss=true", nil)
x(err)
p(err)
req.Header.Set("User-Agent", agent)
client := &http.Client{}
resp, err := client.Do(req)
x(err)
p(err)
body, err := io.ReadAll(resp.Body)
x(err)
x(resp.Body.Close())
p(err)
p(resp.Body.Close())
var rss Rss
x(xml.Unmarshal(body, &rss))
p(xml.Unmarshal(body, &rss))
if len(rss.Items) == 0 {
x(fmt.Errorf("len(rss.Items) == 0"))
p(fmt.Errorf("len(rss.Items) == 0"))
}
for _, item := range rss.Items {
@@ -63,26 +70,31 @@ func main() {
if err != nil {
t, err = time.Parse(time.RFC1123, item.PubDate)
}
x(err)
p(err)
dirname := fmt.Sprintf("/net/corpora/nlnieuws/amsterdam/%d/%02d", t.Year(), int(t.Month()))
if exists(dirname + "/lock") {
continue
}
filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://www.amsterdam.nl/nieuws/"))
x(os.MkdirAll(dirname, 0777))
fp, err := os.Create(filename + ".xml")
x(err)
_, err = fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n")
x(err)
_, err = fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix())
x(err)
_, err = fp.Write(item.Data)
x(err)
_, err = fp.WriteString("</item>\n")
x(err)
x(fp.Close())
x(os.Chtimes(filename+".xml", t, t))
doArticle(filename, item.Link, item.Title, t)
p(os.MkdirAll(dirname, 0777))
func() {
var ok bool
defer func() {
if !ok {
_ = os.Remove(filename + ".xml")
}
}()
fp, err := os.Create(filename + ".xml")
p(err)
p(fp.WriteString("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<item>\n"))
p(fmt.Fprintf(fp, "<unixTime>%d</unixTime>", t.Unix()))
p(fp.Write(item.Data))
p(fp.WriteString("</item>\n"))
p(fp.Close())
p(os.Chtimes(filename+".xml", t, t))
doArticle(filename, item.Link, item.Title, t)
ok = true
}()
}
}
@@ -93,58 +105,54 @@ func doArticle(filename string, url string, title string, timestamp time.Time) {
time.Sleep(2 * time.Second)
req, err := http.NewRequest("GET", url, nil)
x(err)
p(err)
req.Header.Set("User-Agent", agent)
client := &http.Client{}
resp, err := client.Do(req)
x(err)
p(err)
body, err := io.ReadAll(resp.Body)
x(err)
x(resp.Body.Close())
p(err)
p(resp.Body.Close())
doc, err := gokogiri.ParseHtml(body)
x(err)
p(err)
root := doc.Root()
fp, err := os.Create(filename + ".txt")
x(err)
p(err)
_, err = fp.WriteString(addEnd(title))
x(err)
p(fp.WriteString(addEnd(title)))
count := 0
pp, err := root.Search(`//div[@id="zone_intro"]//div[contains(@class, "inleiding")]/p`)
x(err)
for _, p := range pp {
_, err = fp.WriteString(addEnd(p.Content()))
x(err)
p(err)
for _, p1 := range pp {
p(fp.WriteString(addEnd(p1.Content())))
count++
}
ell, err := root.Search(`//div[@id="zone_content"]//div[contains(@class, "tekst")]/child::*`)
x(err)
p(err)
for _, el := range ell {
if n := el.Name(); n == "p" || n == "h3" {
_, err = fp.WriteString(addEnd(el.Content()))
p(fp.WriteString(addEnd(el.Content())))
count++
x(err)
}
}
x(fp.Close())
p(fp.Close())
x(os.Chtimes(filename+".txt", timestamp, timestamp))
p(os.Chtimes(filename+".txt", timestamp, timestamp))
if count == 0 {
fp, err := os.Create(filename + ".debug.html")
x(err)
_, err = fp.Write(body)
x(err)
x(fp.Close())
x(os.Chtimes(filename+".debug.html", timestamp, timestamp))
p(err)
p(fp.Write(body))
p(fp.Close())
p(os.Chtimes(filename+".debug.html", timestamp, timestamp))
}
}