// Command main downloads the VRT NWS headline feed and, for every entry,
// stores the raw entry XML plus the extracted article text under
// /net/corpora/nlnieuws/VRT/<year>/<month>/<day>/.
package main

import (
	"bytes"
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"

	e "codeberg.org/pebbe/errors"
	u "git.web.rug.nl/p209327/nlnieuws/internal/util"
	"github.com/jbowtie/gokogiri"
)

// Rss is the decoded feed document: a <feed> root holding <entry> elements.
type Rss struct {
	XMLName xml.Name `xml:"feed"`
	Items   []ItemT  `xml:"entry"`
}

// ItemT is one feed entry. Data keeps the entry's raw inner XML so that it
// can be written to disk unchanged.
type ItemT struct {
	Title      TitleT   `xml:"title"`
	Published  string   `xml:"published"`
	Updated    string   `xml:"updated"`
	Nstag      []string `xml:"nstag"`
	Nslabeltag []string `xml:"nslabeltag"`
	UnixTime   int64    `xml:"unixTime"`
	ID         string   `xml:"id"`
	Link       []LinkT  `xml:"link"`
	Data       []byte   `xml:",innerxml"`
}

// TitleT is the <title> element of an entry.
type TitleT struct {
	Type string `xml:"type,attr"`
	Text string `xml:",chardata"`
}

// LinkT is a <link> element of an entry.
type LinkT struct {
	Type string `xml:"type,attr"`
	Href string `xml:"href,attr"`
}

const (
	// unixTimeOpen and unixTimeClose delimit the timestamp that main stores
	// in every saved entry file and that fileDate reads back.
	unixTimeOpen  = "<unixTime>"
	unixTimeClose = "</unixTime>"

	// httpTimeout bounds every HTTP request, so that a stalled connection
	// cannot keep the process (and its lock file) alive forever.
	httpTimeout = 2 * time.Minute
)

var (
	p = e.PanicErr
	w = e.WarnErr

	// agent = "AhrefsBot/7.0"
	agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36"

	// httpClient is shared by the feed request and all article requests.
	httpClient = &http.Client{Timeout: httpTimeout}
)

// exists reports whether os.Stat succeeds for filename.
func exists(filename string) bool {
	_, err := os.Stat(filename)
	return err == nil
}

// fileDate returns the text between the first unixTimeOpen marker in the
// named file and the first unixTimeClose marker that follows it. It returns
// "" when the file cannot be read or when either marker is missing.
func fileDate(filename string) string {
	b, err := os.ReadFile(filename)
	if err != nil {
		return ""
	}
	s := string(b)
	start := strings.Index(s, unixTimeOpen)
	if start < 0 {
		return ""
	}
	start += len(unixTimeOpen)
	length := strings.Index(s[start:], unixTimeClose)
	if length < 0 {
		return ""
	}
	return s[start : start+length]
}

// writeStamped creates (or truncates) the named file, writes data to it and
// sets both its access and modification time to stamp. Any failure is passed
// to p.
func writeStamped(name string, data []byte, stamp time.Time) {
	fp, err := os.Create(name)
	p(err)
	p(fp.Write(data))
	p(fp.Close())
	p(os.Chtimes(name, stamp, stamp))
}

// writeMeta appends the "##META text <key> = <value>" header lines for
// values to buf. An empty list still produces one line, with no value.
func writeMeta(buf *bytes.Buffer, key string, values []string) {
	if len(values) == 0 {
		p(fmt.Fprintf(buf, "##META text %s =\n", key))
		return
	}
	for _, v := range values {
		p(fmt.Fprintf(buf, "##META text %s = %s\n", key, u.FixSpace(v)))
	}
}

// itemTime returns the later of the entry's published and updated times.
// Both are tried as RFC 3339 first and as RFC 1123 second. An unparsable
// published time is passed to p; an unparsable updated time is ignored.
func itemTime(item ItemT) time.Time {
	t, err := time.Parse(time.RFC3339Nano, item.Published)
	if err != nil {
		t, err = time.Parse(time.RFC1123, item.Published)
	}
	p(err)

	t2, err := time.Parse(time.RFC3339Nano, item.Updated)
	if err != nil {
		// Best effort: on failure t2 is the zero time, which is never
		// after t, so the published time is kept.
		t2, _ = time.Parse(time.RFC1123, item.Updated)
	}
	if t2.After(t) {
		return t2
	}
	return t
}

// processItem saves one feed entry as <dir>/<id>.xml and then fetches its
// article with doArticle. Directories that contain a file named "lock" are
// skipped. If doArticle reports failure, or anything panics along the way,
// the .xml file is removed again.
func processItem(item ItemT) {
	t := itemTime(item)

	dirname := fmt.Sprintf("/net/corpora/nlnieuws/VRT/%d/%02d/%02d", t.Year(), int(t.Month()), t.Day())
	if exists(dirname + "/lock") {
		return
	}

	filename := dirname + "/" + url.PathEscape(strings.TrimPrefix(item.ID, "https://vrtnws.be/"))
	ts := fmt.Sprintf("%d", t.Unix())
	// Must be evaluated before the .xml file is overwritten below.
	needUpdate := fileDate(filename+".xml") != ts

	p(os.MkdirAll(dirname, 0777))

	// When several text/html links are present, the last one wins.
	var link string
	for _, l := range item.Link {
		if l.Type == "text/html" {
			link = l.Href
		}
	}

	ok := false
	defer func() {
		if e.Panicked {
			fmt.Fprintln(os.Stderr, "----", filename)
			fmt.Fprintln(os.Stderr, "----", link)
		}
		if !ok {
			_ = os.Remove(filename + ".xml")
		}
	}()

	fp, err := os.Create(filename + ".xml")
	p(err)
	// NOTE(review): the two literals wrapping the entry contain only
	// newlines; if an XML declaration or an enclosing element was intended
	// here, it has to be restored by hand.
	p(fp.WriteString("\n\n"))
	p(fp.WriteString(unixTimeOpen + ts + unixTimeClose))
	p(fp.Write(item.Data))
	p(fp.WriteString("\n"))
	p(fp.Close())
	p(os.Chtimes(filename+".xml", t, t))

	ok = doArticle(filename, link, item.Title.Text, item.Nstag, item.Nslabeltag, t, needUpdate)
}

// main takes the corpus lock, downloads the headline feed and processes all
// of its entries. When e.Panicked is set, the deferred handler swallows the
// panic and exits with status 1. The lock file is removed first, because its
// deferred call is registered later.
func main() {
	defer func() {
		if e.Panicked {
			_ = recover()
			os.Exit(1)
		}
	}()

	myLock := "/net/corpora/nlnieuws/VRT/lock"
	u.MkLock(myLock)
	defer func() {
		_ = os.Remove(myLock)
	}()

	resp, err := httpClient.Get("https://www.vrt.be/vrtnws/nl.rss.headlines.xml")
	p(err)
	body, err := io.ReadAll(resp.Body)
	p(err)
	p(resp.Body.Close())
	if resp.StatusCode != http.StatusOK {
		p(fmt.Errorf("fetching feed: unexpected status %s", resp.Status))
	}

	var rss Rss
	p(xml.Unmarshal(body, &rss))
	if len(rss.Items) == 0 {
		p(fmt.Errorf("len(rss.Items) == 0"))
	}

	for _, item := range rss.Items {
		processItem(item)
	}
}

// doArticle downloads the article at url and writes its text to
// filename+".txt", preceded by "##META" lines for cats and tags and by the
// title.
//
// The feed's Nstag values are passed as tags, its Nslabeltag values as cats.
//
// Nothing is downloaded when filename+".skip" exists, or when needUpdate is
// false and filename+".txt" already exists. When needUpdate is true, earlier
// .err, .txt and .html results are removed first. Pages whose canonical link
// contains "/liveblog/" only get a .skip file.
//
// It returns false when the heading or the body text could not be found in
// the page; in that case the problems are written to filename+".err" and the
// downloaded page to filename+".html". In every other case it returns true.
// All files written get timestamp as access and modification time. I/O and
// parse failures are passed to p.
func doArticle(filename string, url string, title string, tags []string, cats []string, timestamp time.Time, needUpdate bool) bool {
	if exists(filename + ".skip") {
		return true
	}

	if needUpdate {
		_ = os.Remove(filename + ".err")
		_ = os.Remove(filename + ".txt")
		_ = os.Remove(filename + ".html")
	} else if exists(filename + ".txt") {
		return true
	}

	// Fixed pause before every article request.
	time.Sleep(2 * time.Second)

	req, err := http.NewRequest("GET", url, nil)
	p(err)
	req.Header.Set("User-Agent", agent)
	resp, err := httpClient.Do(req)
	p(err)
	body, err := io.ReadAll(resp.Body)
	p(err)
	p(resp.Body.Close())

	body = u.HtmlFix(body)

	// Disabled: extraction of the page's JSON-LD script into a .json file.
	/*
		s := string(body)
		ok := true
		i1 := strings.Index(s, `type="application/ld+json"`)
		if i1 < 0 {
			ok = false
		} else {
			i1 += strings.Index(s[i1:], `>`) + 1
			i2 := i1 + strings.Index(s[i1:], ``)
			if i2 < i1 {
				ok = false
			} else {
				s = html.UnescapeString([i1:i2])
			}
		}
		if !ok {
			_ = w(fmt.Errorf("script jsonld not found: %s", url))
			fp, err := os.Create(filename + ".err")
			p(err)
			p(fmt.Fprintf(fp, "script jsonld not found: %s\n", url))
			p(fp.Close())
			p(os.Chtimes(filename+".err", timestamp, timestamp))
			fp, err = os.Create(filename + ".html")
			p(err)
			p(fp.Write(body))
			p(fp.Close())
			p(os.Chtimes(filename+".html", timestamp, timestamp))
			return false
		}
		fp, err := os.Create(filename + ".json")
		p(err)
		p(fp.WriteString(s))
		p(fp.Close())
		p(os.Chtimes(filename+".json", timestamp, timestamp))
	*/

	doc, err := gokogiri.ParseHtml(body)
	p(err)
	root := doc.Root()

	// Live blogs are not collected; the .skip file keeps later runs from
	// downloading the page again.
	canonical, err := root.Search(`//head/link[@rel="canonical"]/@href`)
	p(err)
	for _, href := range canonical {
		if strings.Contains(href.String(), "/liveblog/") {
			writeStamped(filename+".skip", []byte("liveblog\n"), timestamp)
			return true
		}
	}

	var buf bytes.Buffer
	writeMeta(&buf, "cat", cats)
	writeMeta(&buf, "tag", tags)
	_, err = buf.WriteString(u.AddEnd(u.FixSpace(title)))
	p(err)

	// collect appends the content of every node matching xpath to buf and
	// reports whether there was at least one match.
	collect := func(xpath string) bool {
		nodes, err := root.Search(xpath)
		p(err)
		for _, n := range nodes {
			p(fmt.Fprint(&buf, u.AddEnd(u.FixSpace(n.Content()))))
		}
		return len(nodes) > 0
	}

	// missing records a part of the page that was not found, both for the
	// .err file and as a warning.
	var problems []string
	missing := func(what string) {
		problems = append(problems, fmt.Sprintf("no %s: %s\n", what, url))
		_ = w(fmt.Errorf("no %s: %s", what, url))
	}

	if !collect(`//div[@data-sentry-component="ArticleHeading"]//*[contains(@class,"prose-article-body-r")]`) {
		missing("heading")
	}
	if !collect(
		`//div[@data-sentry-component="ArticleText"]//*[contains(@class,"prose-article-body-r")]` +
			` | ` +
			`//div[@data-sentry-component="ArticleTitle"]//h2`) {
		missing("text")
	}

	if len(problems) > 0 {
		// Keep the page next to the error report for later inspection.
		writeStamped(filename+".err", []byte(strings.Join(problems, "")), timestamp)
		writeStamped(filename+".html", body, timestamp)
		return false
	}

	writeStamped(filename+".txt", buf.Bytes(), timestamp)
	return true
}