// This program downloads the sargasso.nl RSS feed and stores each feed item
// as its own file under /net/corpora/nlnieuws/Sargasso/<ISO year>/w<ISO week>/.
package main

import (
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"

	e "codeberg.org/pebbe/errors"
	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
)

const (
	// feedURL is the RSS feed that is downloaded on every run.
	feedURL = "https://sargasso.nl/feed/"

	// corpusDir is the root directory below which the lock file and the
	// per-week item directories live.
	corpusDir = "/net/corpora/nlnieuws/Sargasso"

	// fetchTimeout bounds the whole HTTP exchange (connect, headers and
	// body) so a stalled server cannot keep the run, and its lock file,
	// alive forever.
	fetchTimeout = 60 * time.Second
)

// Rss is the decoded feed document; only the items of the channel are kept.
type Rss struct {
	XMLName xml.Name `xml:"rss"`
	Items   []ItemT  `xml:"channel>item"`
}

// ItemT is a single feed item. Data holds the raw inner XML of the <item>
// element, which is what gets written to disk.
type ItemT struct {
	PubDate  string `xml:"pubDate"`
	UnixTime int64  `xml:"unixTime"`
	Guid     string `xml:"guid"`
	Link     string `xml:"link"`
	Data     []byte `xml:",innerxml"`
}

var (
	// p is the error check used throughout this file.
	// NOTE(review): presumably it panics on a non-nil error and sets
	// e.Panicked, which main's deferred handler relies on — confirm in
	// codeberg.org/pebbe/errors.
	p = e.PanicErr

	// agent is the value sent in the User-Agent request header.
	agent = "AhrefsBot/7.0"
)

// exists reports whether os.Stat succeeds for filename.
func exists(filename string) bool {
	_, err := os.Stat(filename)
	return err == nil
}

// main takes the corpus lock, fetches the feed and stores every item.
// When e.Panicked is set, the pending panic is swallowed and the process
// exits with status 1.
func main() {
	defer func() {
		if e.Panicked {
			_ = recover()
			os.Exit(1)
		}
	}()

	myLock := corpusDir + "/lock"
	u.MkLock(myLock)
	// Registered after the exit handler above, so it runs before it and the
	// lock file is removed even on the os.Exit path.
	defer func() {
		_ = os.Remove(myLock)
	}()

	rss := fetchFeed()
	for _, item := range rss.Items {
		storeItem(item)
	}
}

// fetchFeed downloads feedURL and decodes it into an Rss value. Any failure,
// including a non-200 response or a feed without items, is reported via p.
func fetchFeed() Rss {
	req, err := http.NewRequest("GET", feedURL, nil)
	p(err)
	req.Header.Set("User-Agent", agent)

	client := &http.Client{Timeout: fetchTimeout}
	resp, err := client.Do(req)
	p(err)

	// Close the body before acting on the read error so it is released on
	// every path.
	body, err := io.ReadAll(resp.Body)
	closeErr := resp.Body.Close()
	p(err)
	p(closeErr)

	// Without this check an error page would be handed to the XML decoder
	// and show up as a confusing parse error or an empty item list.
	if resp.StatusCode != http.StatusOK {
		p(fmt.Errorf("GET %s: unexpected status %s", feedURL, resp.Status))
	}

	var rss Rss
	p(xml.Unmarshal(body, &rss))
	if len(rss.Items) == 0 {
		p(fmt.Errorf("len(rss.Items) == 0"))
	}
	return rss
}

// itemFileStem derives the path-escaped file name (without extension) for an
// item from its guid: the "https://sargasso.nl/?" prefix is dropped and, if
// the remainder contains "p=", only the text after its last occurrence is
// kept.
func itemFileStem(guid string) string {
	stem := strings.TrimPrefix(guid, "https://sargasso.nl/?")
	if i := strings.LastIndex(stem, "p="); i >= 0 {
		stem = stem[i+2:]
	}
	return url.PathEscape(stem)
}

// storeItem writes one feed item to <corpusDir>/<year>/w<week>/<stem>.xml,
// where year and week are the ISO week of the item's publication date, and
// sets the file's access and modification times to that date. Items whose
// week directory contains a file named "lock" are skipped.
func storeItem(item ItemT) {
	// Try the numeric-zone layout first, then the named-zone layout.
	t, err := time.Parse(time.RFC1123Z, item.PubDate)
	if err != nil {
		t, err = time.Parse(time.RFC1123, item.PubDate)
	}
	p(err)

	year, week := t.ISOWeek()
	dirname := fmt.Sprintf("%s/%d/w%02d", corpusDir, year, week)
	if exists(dirname + "/lock") {
		return
	}

	filename := dirname + "/" + itemFileStem(item.Guid)
	p(os.MkdirAll(dirname, 0777))

	var ok bool
	defer func() {
		// Identify the offending item on stderr when a write step failed.
		if e.Panicked {
			fmt.Fprintln(os.Stderr, "----", filename)
			fmt.Fprintln(os.Stderr, "----", item.Link)
		}
		// Never leave a partially written file behind.
		if !ok {
			_ = os.Remove(filename + ".xml")
		}
	}()

	fp, err := os.Create(filename + ".xml")
	p(err)
	// NOTE(review): the literals below are written exactly as found; as they
	// stand the output has no XML declaration or wrapping element around the
	// timestamp and item content — confirm this is the intended format.
	p(fp.WriteString("\n\n"))
	p(fmt.Fprintf(fp, "%d", t.Unix()))
	p(fp.Write(item.Data))
	p(fp.WriteString("\n"))
	p(fp.Close())
	p(os.Chtimes(filename+".xml", t, t))
	ok = true
}