// Command parool fetches the RSS feed of parool.nl and stores, for every
// feed item, the raw item XML plus a plain-text extraction of the article
// in a per-ISO-week directory below /net/corpora/nlnieuws/Parool.
package main

import (
	//"encoding/json"
	"encoding/xml"
	"fmt"
	//"html"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"

	e "codeberg.org/pebbe/errors"
	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
	"github.com/jbowtie/gokogiri"
)

// Rss is the part of the RSS document that is decoded: the items of the
// channel.
type Rss struct {
	XMLName xml.Name `xml:"rss"`
	Items   []ItemT  `xml:"channel>item"`
}

// ItemT is one feed item. Data holds the raw inner XML of the item, which
// is copied verbatim into the per-item .xml file.
type ItemT struct {
	PubDate  string `xml:"pubDate"`
	UnixTime int64  `xml:"unixTime"`
	Guid     string `xml:"guid"`
	Link     string `xml:"link"`
	Title    string `xml:"title"`
	Data     []byte `xml:",innerxml"`
}

/*
type GraphT struct {
	Graph []map[string]any `json:"@graph"`
}
*/

// Markers that surround the publication timestamp in a per-item .xml file.
// main writes them and fileDate searches for them, so they must stay in sync.
const (
	unixTimeOpen  = "<unixTime>"
	unixTimeClose = "</unixTime>"
)

var (
	// p is called with the result of every operation that must not fail,
	// w with errors that are only reported. NOTE(review): the exact
	// behaviour of both lives in the errors package; main relies on
	// e.Panicked being set when p aborts.
	p = e.PanicErr
	w = e.WarnErr

	// agent is sent as the User-Agent header of every request.
	agent = "AhrefsBot/7.0"

	// client is shared by all requests. The timeout keeps a stalled
	// connection from blocking the run (and the lock it holds) forever.
	client = &http.Client{Timeout: time.Minute}
)

// exists reports whether os.Stat succeeds for filename.
func exists(filename string) bool {
	_, err := os.Stat(filename)
	return err == nil
}

// fileDate returns the text between the first unixTimeOpen marker and the
// following unixTimeClose marker in the named file. It returns "" when the
// file cannot be read or when either marker is missing.
func fileDate(filename string) string {
	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	s := string(b)
	start := strings.Index(s, unixTimeOpen)
	if start < 0 {
		return ""
	}
	start += len(unixTimeOpen)
	length := strings.Index(s[start:], unixTimeClose)
	if length < 0 {
		return ""
	}
	return s[start : start+length]
}

// fetch performs a GET request for rawURL with the configured User-Agent
// and returns the complete response body. The HTTP status code is not
// inspected: callers decide what to do with the content. Any transport or
// read error is passed to p.
func fetch(rawURL string) []byte {
	req, err := http.NewRequest("GET", rawURL, nil)
	p(err)
	req.Header.Set("User-Agent", agent)
	resp, err := client.Do(req)
	p(err)
	body, err := io.ReadAll(resp.Body)
	// Close before checking the read error so the body is released even
	// when reading failed.
	closeErr := resp.Body.Close()
	p(err)
	p(closeErr)
	return body
}

// writeFile creates (or truncates) name with the given content and sets
// its access and modification times to stamp.
func writeFile(name string, data []byte, stamp time.Time) {
	p(os.WriteFile(name, data, 0666))
	p(os.Chtimes(name, stamp, stamp))
}

// saveFailure reports "<reason>: <link>" as a warning, stores the same
// line in filename.err and the fetched page in filename.html, and returns
// false so callers can use it directly as their result.
func saveFailure(filename, reason, link string, body []byte, stamp time.Time) bool {
	_ = w(fmt.Errorf("%s: %s", reason, link))
	writeFile(filename+".err", []byte(reason+": "+link+"\n"), stamp)
	writeFile(filename+".html", body, stamp)
	return false
}

// main downloads the feed and processes every item in it.
func main() {
	// Registered first, so it runs last: after the lock has been removed.
	defer func() {
		if e.Panicked {
			_ = recover()
			os.Exit(1)
		}
	}()

	myLock := "/net/corpora/nlnieuws/Parool/lock"
	u.MkLock(myLock)
	defer func() {
		_ = os.Remove(myLock)
	}()

	var rss Rss
	p(xml.Unmarshal(fetch("https://www.parool.nl/amsterdam/rss.xml"), &rss))
	if len(rss.Items) == 0 {
		p(fmt.Errorf("len(rss.Items) == 0"))
	}

	for _, item := range rss.Items {
		t, err := time.Parse(time.RFC1123Z, item.PubDate)
		if err != nil {
			t, err = time.Parse(time.RFC1123, item.PubDate)
		}
		p(err)

		year, week := t.ISOWeek()
		dirname := fmt.Sprintf("/net/corpora/nlnieuws/Parool/%d/w%02d", year, week)
		// A week directory that contains a file named "lock" is left alone.
		if exists(dirname + "/lock") {
			continue
		}

		basename := item.Guid
		filename := dirname + "/" + url.PathEscape(basename)

		// The article is fetched again when the timestamp stored in an
		// earlier .xml file differs from the one in the feed (or when
		// there is no earlier file). This must be decided before the
		// .xml file is rewritten below.
		ts := fmt.Sprintf("%d", t.Unix())
		needUpdate := fileDate(filename+".xml") != ts

		p(os.MkdirAll(dirname, 0777))

		func() {
			var ok bool
			defer func() {
				if e.Panicked {
					fmt.Fprintln(os.Stderr, "----", filename)
					fmt.Fprintln(os.Stderr, "----", item.Link)
				}
				// Without the .xml file the next run sees no stored
				// timestamp and tries this item again.
				if !ok {
					_ = os.Remove(filename + ".xml")
				}
			}()

			fp, err := os.Create(filename + ".xml")
			p(err)
			p(fp.WriteString("\n\n"))
			p(fmt.Fprintf(fp, unixTimeOpen+"%d"+unixTimeClose, t.Unix()))
			p(fp.Write(item.Data))
			p(fp.WriteString("\n"))
			p(fp.Close())
			p(os.Chtimes(filename+".xml", t, t))

			ok = doArticle(filename, item.Link, item.Title, t, needUpdate)
		}()
	}
}

// doArticle downloads the article at url and writes its text to
// filename.txt: one "##META text tag" line per label (or a single empty
// one), then the title, the intro and the body paragraphs.
//
// Nothing is fetched when filename.skip exists, or when needUpdate is false
// and filename.txt already exists. When needUpdate is true, earlier .err,
// .html and .txt files are removed first.
//
// All files written get timestamp as access and modification time.
//
// The result is true when the article was stored or deliberately skipped
// (live blog, or no text in the body elements; a .skip file is written in
// both cases). It is false when the expected page structure was not found;
// then filename.err holds the reason and filename.html the fetched page.
func doArticle(filename string, url string, title string, timestamp time.Time, needUpdate bool) (ok bool) {
	if exists(filename + ".skip") {
		return true
	}
	if needUpdate {
		_ = os.Remove(filename + ".err")
		_ = os.Remove(filename + ".html")
		// _ = os.Remove(filename + ".json")
		_ = os.Remove(filename + ".txt")
	} else if exists(filename + ".txt") {
		return true
	}

	// Pause before every article request.
	time.Sleep(2 * time.Second)

	body := fetch(url)

	doc, err := gokogiri.ParseHtml(body)
	p(err)
	// The parsed document is backed by memory that the Go garbage
	// collector does not manage.
	defer doc.Free()

	/*
		Disabled JSON-LD extraction, kept for reference.

		s := string(body)
		ok = true
		i1 := strings.Index(s, ``)
			if i2 < i1 {
				ok = false
			} else {
				s = html.UnescapeString(s[i1:i2])
			}
		}
		if !ok {
			_ = w(fmt.Errorf("script jsonld not found: %s", url))
			fp, err := os.Create(filename + ".err")
			p(err)
			p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
			p(fp.Close())
			p(os.Chtimes(filename+".err", timestamp, timestamp))
			fp, err = os.Create(filename + ".html")
			p(err)
			p(fp.Write(body))
			p(fp.Close())
			p(os.Chtimes(filename+".html", timestamp, timestamp))
			return false
		}

		var graph GraphT
		p(json.Unmarshal([]byte(s), &graph))
		for _, g := range graph.Graph {
			t := g["@type"]
			switch v := t.(type) {
			case string:
				if v == "NewsArticle" {
					b, err := json.Marshal(g)
					p(err)
					s = string(b)
				}
			}
		}

		fp, err := os.Create(filename + ".json")
		p(err)
		p(fp.WriteString(s))
		p(fp.Close())
		p(os.Chtimes(filename+".json", timestamp, timestamp))
	*/

	root := doc.Root()

	articles, err := root.Search(`//article[@id="article-content"]`)
	p(err)
	if len(articles) == 0 {
		return saveFailure(filename, "empty", url, body, timestamp)
	}
	article := articles[0]

	live, err := article.Search(`.//*[@data-test-id="live-blog-label"]`)
	p(err)
	if len(live) > 0 {
		writeFile(filename+".skip", []byte("liveblog\n"), timestamp)
		return true
	}

	headers, err := article.Search(`.//header`)
	p(err)
	if len(headers) == 0 {
		return saveFailure(filename, "no header", url, body, timestamp)
	}
	header := headers[0]

	// Labels become tags; the label "Nieuws" is not stored as a tag.
	var tags []string
	labels, err := header.Search(`.//*[@data-test-id="article-label"]`)
	p(err)
	if len(labels) == 0 {
		_ = w(fmt.Errorf("no labels: %s", url))
	}
	for _, label := range labels {
		if s := strings.TrimSpace(label.Content()); s != "" && s != "Nieuws" {
			tags = append(tags, s)
		}
	}

	// The intro paragraphs from the header come first.
	var pars []string
	intros, err := header.Search(`.//*[@data-test-id="header-intro"]`)
	p(err)
	for _, intro := range intros {
		if s := strings.TrimSpace(intro.Content()); s != "" {
			pars = append(pars, s)
		}
	}
	if len(pars) == 0 {
		_ = w(fmt.Errorf("no intro: %s", url))
	}

	// Drop asides, figures and bold text from the sections before their
	// content is collected.
	specials, err := article.Search(`.//section//aside | .//section//figure | .//section//b`)
	p(err)
	for _, special := range specials {
		special.Remove()
	}

	elements, err := article.Search(`.//section//*[@data-article-element-index]`)
	p(err)
	if len(elements) == 0 {
		return saveFailure(filename, "no elements", url, body, timestamp)
	}

	introCount := len(pars)
	for _, element := range elements {
		if s := strings.TrimSpace(element.Content()); s != "" {
			pars = append(pars, s)
		}
	}
	if len(pars) == introCount {
		// Elements were found but none of them contains text.
		_ = w(fmt.Errorf("no text, skipping: %s", url))
		writeFile(filename+".skip", []byte(url+"\n"), timestamp)
		writeFile(filename+".html", body, timestamp)
		return true
	}

	var text strings.Builder
	if len(tags) == 0 {
		text.WriteString("##META text tag =\n")
	} else {
		for _, tag := range tags {
			fmt.Fprintf(&text, "##META text tag = %s\n", u.FixSpace(tag))
		}
	}
	text.WriteString(u.AddEnd(u.FixSpace(title)))
	for _, par := range pars {
		text.WriteString(u.AddEnd(u.FixSpace(par)))
	}
	writeFile(filename+".txt", []byte(text.String()), timestamp)
	return true
}