// This program downloads the nieuws.nl news feed and, for every item in it,
// stores the item's XML plus the plain text of the linked article in a
// date-based directory tree.
package main

import (
	"github.com/jbowtie/gokogiri"
	"github.com/pebbe/util"

	"bytes"
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"
)

// Rss is the decoded feed document: the <item> elements found inside
// <channel> of an <rss> root element.
type Rss struct {
	XMLName xml.Name `xml:"rss"`
	Items   []ItemT  `xml:"channel>item"`
}

// ItemT is a single feed item. Data holds the raw inner XML of the item,
// which is copied verbatim into the stored .xml file.
type ItemT struct {
	Title    string `xml:"title"`
	PubDate  string `xml:"pubDate"`
	UnixTime int64  `xml:"unixTime"`
	Guid     string `xml:"guid"`
	Link     string `xml:"link"`
	Data     []byte `xml:",innerxml"`
}

// Markers that delimit the publication timestamp inside a stored .xml file.
// main writes them and fileDate looks for them, so they must stay in sync.
const (
	unixTimeOpen  = "<unixTime>"
	unixTimeClose = "</unixTime>"
)

var (
	// NOTE(review): x (util.CheckErr) is used as if it aborts the program on a
	// non-nil error, and w (util.WarnErr) as if it only reports the error and
	// hands it back — confirm against github.com/pebbe/util.
	x = util.CheckErr
	w = util.WarnErr

	// agent is sent as the User-Agent header with every request.
	agent = "AhrefsBot/7.0"

	// httpClient is shared by all requests. The timeout keeps a stalled
	// server from blocking the run forever.
	httpClient = &http.Client{Timeout: time.Minute}
)

// exists reports whether os.Stat succeeds for filename.
func exists(filename string) bool {
	_, err := os.Stat(filename)
	return err == nil
}

// fileDate returns the text between the unixTime markers of a stored .xml
// file. It returns "" when the file cannot be read or when either marker is
// missing, so callers comparing against a timestamp see a mismatch.
func fileDate(filename string) string {
	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	s := string(b)
	start := strings.Index(s, unixTimeOpen)
	if start < 0 {
		return ""
	}
	start += len(unixTimeOpen)
	// Search for the closing marker only after the opening one, so the
	// resulting slice bounds are always valid.
	length := strings.Index(s[start:], unixTimeClose)
	if length < 0 {
		return ""
	}
	return s[start : start+length]
}

// fetch performs a GET request for rawURL with the configured User-Agent and
// returns the complete response body. The HTTP status code is not inspected.
func fetch(rawURL string) ([]byte, error) {
	req, err := http.NewRequest("GET", rawURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", agent)
	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, err
	}
	body, err := io.ReadAll(resp.Body)
	// Always close the body; report the close error only if reading succeeded.
	if cerr := resp.Body.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		return nil, err
	}
	return body, nil
}

// writeStamped creates (or truncates) the file name, writes data to it and
// sets its access and modification times to timestamp. Any failure is passed
// to x.
func writeStamped(name string, data []byte, timestamp time.Time) {
	fp, err := os.Create(name)
	x(err)
	_, err = fp.Write(data)
	x(err)
	x(fp.Close())
	x(os.Chtimes(name, timestamp, timestamp))
}

// main fetches the feed, rewrites the .xml file of every item and lets
// doArticle fetch the article text. The .xml file is removed again when
// doArticle reports that the article could not be processed.
func main() {
	body, err := fetch("https://nieuws.nl/sitemap/news.xml")
	x(err)

	var rss Rss
	x(xml.Unmarshal(body, &rss))
	if len(rss.Items) == 0 {
		x(fmt.Errorf("len(rss.Items) == 0"))
	}

	for _, item := range rss.Items {
		// Try the numeric-zone layout first, then the named-zone layout.
		t, err := time.Parse(time.RFC1123Z, item.PubDate)
		if err != nil {
			t, err = time.Parse(time.RFC1123, item.PubDate)
		}
		x(err)

		dirname := fmt.Sprintf("/net/corpora/nlnieuws/NieuwsNL/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
		filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.Guid, "urn:uuid:"))
		ts := fmt.Sprintf("%d", t.Unix())

		// Must be determined before the .xml file is rewritten below.
		needUpdate := fileDate(filename+".xml") != ts

		x(os.MkdirAll(dirname, 0777))

		// NOTE(review): the header and footer consist of newlines only; confirm
		// that no wrapper markup is expected around the item data.
		var out bytes.Buffer
		out.WriteString("\n\n")
		out.WriteString(unixTimeOpen + ts + unixTimeClose)
		out.Write(item.Data)
		out.WriteString("\n")
		writeStamped(filename+".xml", out.Bytes(), t)

		if !doArticle(filename, item.Link, item.Title, t, needUpdate) {
			x(os.Remove(filename + ".xml"))
		}
	}
}

// doArticle downloads the article at url and writes its text to
// filename+".txt", stamped with timestamp.
//
// When needUpdate is true, earlier output files (.err, .html, .txt, .skip) are
// removed first. When it is false and a .txt or .skip file already exists,
// nothing is downloaded and true is returned.
//
// The text file starts with a "##META text cat" line per category link found,
// followed by title and every paragraph, each passed through addEnd.
//
// It returns false when the page contains no paragraphs; in that case the
// collected messages go to filename+".err" and the raw page to
// filename+".html". Otherwise it returns true.
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) bool {
	if needUpdate {
		// Best effort: these files need not exist, so errors are ignored.
		_ = os.Remove(filename + ".err")
		_ = os.Remove(filename + ".html")
		_ = os.Remove(filename + ".txt")
		_ = os.Remove(filename + ".skip")
	} else if exists(filename+".txt") || exists(filename+".skip") {
		return true
	}

	// Pause before each article request.
	time.Sleep(2 * time.Second)

	body, err := fetch(url)
	x(err)

	doc, err := gokogiri.ParseHtml(body)
	x(err)
	// The parsed document must be released explicitly.
	defer doc.Free()

	// Writes to a bytes.Buffer never return an error, so they are not checked.
	var buf bytes.Buffer
	var fouten []string
	root := doc.Root()

	aa, err := root.Search(`//a[contains(@class, "articleHeader__info__category")]`)
	x(err)
	if len(aa) == 0 {
		buf.WriteString("##META text cat =\n")
		// Not an error by itself, only a warning that is recorded in case
		// real errors follow; hence no "return false" here.
		_ = w(fmt.Errorf("no cat: %s", url))
		fouten = append(fouten, fmt.Sprintf("no cat: %s\n", url))
	} else {
		for _, a := range aa {
			cat := strings.ReplaceAll(a.Content(), "\n", " ")
			fmt.Fprintf(&buf, "##META text cat = %s\n", cat)
		}
	}

	buf.WriteString(addEnd(title))

	// Previous selector: //div[@id="article-blocks"]//p
	pp, err := root.Search(`//div[@id="article-blocks"]//div[contains(@class, "paragraph-content")]`)
	x(err)
	if len(pp) == 0 {
		// This is a real error: keep the messages and the raw page for
		// inspection and report failure.
		_ = w(fmt.Errorf("empty: %s", url))
		fouten = append(fouten, fmt.Sprintf("empty: %s\n", url))
		writeStamped(filename+".err", []byte(strings.Join(fouten, "")), timestamp)
		writeStamped(filename+".html", body, timestamp)
		return false
	}

	for _, p := range pp {
		buf.WriteString(addEnd(p.Content()))
	}
	writeStamped(filename+".txt", buf.Bytes(), timestamp)
	return true
}

// addEnd trims surrounding white space from s and returns it terminated by a
// newline. Text that does not already end in '.', '!' or '?' (optionally
// followed by a single or double quote) gets a '.' appended first. Empty
// input yields "".
func addEnd(s string) string {
	s = strings.TrimSpace(s)
	n := len(s)
	if n == 0 {
		return ""
	}
	switch s[n-1] {
	case '.', '!', '?':
		return s + "\n"
	case '"', '\'':
		// A closing quote counts as terminated only when the character
		// before it is sentence-ending punctuation.
		if n > 1 {
			switch s[n-2] {
			case '.', '!', '?':
				return s + "\n"
			}
		}
	}
	return s + ".\n"
}