cats en tags
This commit is contained in:
@@ -161,19 +161,19 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
|
||||
|
||||
root := doc.Root()
|
||||
|
||||
var cat string
|
||||
var tag string
|
||||
aa, err := root.Search(`//a[contains(@class, "articleHeader__info__category")]`)
|
||||
p(err)
|
||||
if len(aa) == 0 {
|
||||
p(fmt.Fprintln(&buf, "##META text cat ="))
|
||||
_ = w(fmt.Errorf("no cat: %s", url))
|
||||
p(fmt.Fprintln(&buf, "##META text tag ="))
|
||||
_ = w(fmt.Errorf("no tag: %s", url))
|
||||
// geen fout, maar waarschuwing als er meer fouten zijn
|
||||
fouten = append(fouten, fmt.Sprintf("no text: %s\n", url))
|
||||
// dus geen return false
|
||||
} else {
|
||||
for _, a := range aa {
|
||||
cat = strings.ReplaceAll(a.Content(), "\n", " ")
|
||||
p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat)))
|
||||
tag = strings.ReplaceAll(a.Content(), "\n", " ")
|
||||
p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -58,6 +58,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.
|
||||
cd xml
|
||||
alto -o $corpus.data.dz *.xml 2> /dev/null
|
||||
|
||||
# telling per bericht, niet per zin
|
||||
/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt
|
||||
|
||||
cd ../..
|
||||
rm -fr out
|
||||
|
||||
|
||||
Reference in New Issue
Block a user