diff --git a/Parool/cmd/parool/parool.go b/Parool/cmd/parool/parool.go index 1ab5502..b092389 100644 --- a/Parool/cmd/parool/parool.go +++ b/Parool/cmd/parool/parool.go @@ -182,6 +182,9 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool tags := make([]string, 0) ell, err := article.Search(`//header//*[@data-test-id="article-label"]`) p(err) + if len(ell) == 0 { + _ = w(fmt.Errorf("no labels: %s", url)) + } for _, el := range ell { s := strings.TrimSpace(el.Content()) if s != "" { @@ -189,11 +192,13 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool } } - fouten := make([]string, 0) pars := make([]string, 0) ell, err = article.Search(`//header//*[@data-test-id="article-title"]`) p(err) + if len(ell) != 1 { + _ = w(fmt.Errorf("found %d titles: %s", len(ell), url)) + } for _, el := range ell { s := strings.TrimSpace(el.Content()) if s != "" { @@ -212,8 +217,7 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool } } if !found { - fouten = append(fouten, fmt.Sprintf("no heading: %s\n", url)) - _ = w(fmt.Errorf("no heading: %s", url)) + _ = w(fmt.Errorf("no intro: %s", url)) } specials, err := article.Search(`//section//aside | //section//figure | //section//b`) @@ -222,27 +226,14 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool special.Remove() } - found = false ell, err = article.Search(`//section//*[@data-article-element-index]`) p(err) - for _, el := range ell { - s := strings.TrimSpace(el.Content()) - if s != "" { - pars = append(pars, s) - found = true - } - } - if !found { - fouten = append(fouten, fmt.Sprintf("no text: %s\n", url)) - _ = w(fmt.Errorf("no text: %s", url)) - } + if len(ell) == 0 { + _ = w(fmt.Errorf("no elements: %s", url)) - if len(fouten) > 0 { fp, err := os.Create(filename + ".err") p(err) - for _, fout := range fouten { - p(fp.WriteString(fout)) - } + p(fmt.Fprintf(fp, "no elements: %s\n", url)) p(fp.Close()) p(os.Chtimes(filename+".err", timestamp, timestamp)) @@ -255,6 +246,30 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool return false } + found = false + for _, el := range ell { + s := strings.TrimSpace(el.Content()) + if s != "" { + pars = append(pars, s) + found = true + } + } + if !found { + _ = w(fmt.Errorf("no text, skipping: %s", url)) + fp, err := os.Create(filename + ".skip") + p(fp.WriteString(url + "\n")) + p(err) + p(os.Chtimes(filename+".skip", timestamp, timestamp)) + + fp, err = os.Create(filename + ".html") + p(err) + p(fp.Write(body)) + p(fp.Close()) + p(os.Chtimes(filename+".html", timestamp, timestamp)) + + return true + } + fp, err := os.Create(filename + ".txt") p(err)