package main import ( e "codeberg.org/pebbe/errors" "github.com/jbowtie/gokogiri" "encoding/xml" "fmt" "io" "net/http" "net/url" "os" "path/filepath" "strings" "time" ) type Rss struct { XMLName xml.Name `xml:"rss"` Items []ItemT `xml:"channel>item"` } type ItemT struct { Title string `xml:"title"` PubDate string `xml:"pubDate"` UnixTime int64 `xml:"unixTime"` Guid string `xml:"guid"` Link string `xml:"link"` Data []byte `xml:",innerxml"` } var ( p = e.PanicErr agent = "AhrefsBot/7.0" // agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36" ) func exists(filename string) bool { _, err := os.Stat(filename) return err == nil } func main() { defer func() { if e.Panicked { _ = recover() os.Exit(1) } }() myLock := "/net/corpora/nlnieuws/Amsterdam/lock" mkLock(myLock) defer func() { _ = os.Remove(myLock) }() req, err := http.NewRequest("GET", "https://www.amsterdam.nl/nieuws/nieuwsoverzicht/?rss=true", nil) p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) p(err) body, err := io.ReadAll(resp.Body) p(err) p(resp.Body.Close()) var rss Rss p(xml.Unmarshal(body, &rss)) if len(rss.Items) == 0 { p(fmt.Errorf("len(rss.Items) == 0")) } for _, item := range rss.Items { t, err := time.Parse(time.RFC1123Z, item.PubDate) if err != nil { t, err = time.Parse(time.RFC1123, item.PubDate) } p(err) dirname := fmt.Sprintf("/net/corpora/nlnieuws/Amsterdam/%d/%02d", t.Year(), int(t.Month())) if exists(dirname + "/lock") { continue } filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "https://www.amsterdam.nl/nieuws/")) p(os.MkdirAll(dirname, 0777)) func() { var ok bool defer func() { if e.Panicked { fmt.Fprintln(os.Stderr, "----", filename, "----") } if !ok { _ = os.Remove(filename + ".xml") } }() fp, err := os.Create(filename + ".xml") p(err) p(fp.WriteString("\n\n")) p(fmt.Fprintf(fp, "%d", t.Unix())) p(fp.Write(item.Data)) p(fp.WriteString("\n")) p(fp.Close()) p(os.Chtimes(filename+".xml", t, t)) doArticle(filename, item.Link, item.Title, t) ok = true }() } } func doArticle(filename string, url string, title string, timestamp time.Time) { if exists(filename + ".txt") { return } time.Sleep(2 * time.Second) req, err := http.NewRequest("GET", url, nil) p(err) req.Header.Set("User-Agent", agent) client := &http.Client{} resp, err := client.Do(req) p(err) body, err := io.ReadAll(resp.Body) p(err) p(resp.Body.Close()) doc, err := gokogiri.ParseHtml(body) p(err) root := doc.Root() fp, err := os.Create(filename + ".txt") p(err) p(fp.WriteString(addEnd(title))) count := 0 pp, err := root.Search(`//div[@id="zone_intro"]//div[contains(@class, "inleiding")]/p`) p(err) for _, p1 := range pp { p(fp.WriteString(addEnd(p1.Content()))) count++ } ell, err := root.Search(`//div[@id="zone_content"]//div[contains(@class, "tekst")]/child::*`) p(err) for _, el := range ell { if n := el.Name(); n == "p" || n == "h3" { p(fp.WriteString(addEnd(el.Content()))) count++ } } p(fp.Close()) p(os.Chtimes(filename+".txt", timestamp, timestamp)) if count == 0 { fp, err := os.Create(filename + ".debug.html") p(err) p(fp.Write(body)) p(fp.Close()) p(os.Chtimes(filename+".debug.html", timestamp, timestamp)) } } func addEnd(s string) string { s = strings.TrimSpace(s) n := len(s) if n == 0 { return "" } if n > 0 { if strings.ContainsAny(s[n-1:], ".!?") { return s + "\n" } } if n > 1 { s2 := s[n-2:] if s2 == `."` || s2 == `!"` || s2 == `?"` || s2 == `.'` || s2 == `!'` || s2 == `?'` { return s + "\n" } } return s + ".\n" } func mkLock(filename string) { pid := os.Getpid() link := fmt.Sprintf("%s.%d", filepath.Base(filename), pid) p(os.Symlink(link, filename)) name, err := os.Readlink(filename) p(err) if name != link { p(fmt.Errorf("wrong lock name %q, should be %q", name, link)) } }