diff --git a/NOS/txt2corpus.sh b/NOS/txt2corpus.sh index 4fbad73..e9e2983 100755 --- a/NOS/txt2corpus.sh +++ b/NOS/txt2corpus.sh @@ -58,6 +58,10 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus. cd xml alto -o $corpus.data.dz *.xml 2> /dev/null +# telling per bericht, niet per zin +/net/corpora/nlnieuws/namen.sh -x 9 -s $corpus.data.dz > $corpus.cat.txt +/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt + cd ../.. rm -fr out diff --git a/NU/cmd/nu/nu.go b/NU/cmd/nu/nu.go index 597ac0b..aa2d28f 100644 --- a/NU/cmd/nu/nu.go +++ b/NU/cmd/nu/nu.go @@ -211,11 +211,11 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool return false } - cats := make([]string, 0) + tags := make([]string, 0) var buffer bytes.Buffer for _, i := range doc.Graph { p(buffer.WriteString(html.UnescapeString(i.ArticleBody))) - cats = append(cats, i.ArticleSection...) + tags = append(tags, i.ArticleSection...) } text := buffer.String() @@ -245,11 +245,11 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool fp, err := os.Create(filename + ".txt") p(err) - if len(cats) == 0 { - p(fmt.Fprintln(fp, "##META text cat =")) + if len(tags) == 0 { + p(fmt.Fprintln(fp, "##META text tag =")) } else { - for _, cat := range cats { - p(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) + for _, tag := range tags { + p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag))) } } p(fp.WriteString(text)) diff --git a/NU/txt2corpus.sh b/NU/txt2corpus.sh index d3051fc..9c9e538 100755 --- a/NU/txt2corpus.sh +++ b/NU/txt2corpus.sh @@ -56,6 +56,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus. cd xml alto -o $corpus.data.dz *.xml 2> /dev/null +# telling per bericht, niet per zin +/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt + cd ../.. rm -fr out diff --git a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go index 1011a50..bdcb739 100644 --- a/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go +++ b/NieuwsNL/cmd/nieuwsnl/nieuwsnl.go @@ -161,19 +161,19 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n root := doc.Root() - var cat string + var tag string aa, err := root.Search(`//a[contains(@class, "articleHeader__info__category")]`) p(err) if len(aa) == 0 { - p(fmt.Fprintln(&buf, "##META text cat =")) - _ = w(fmt.Errorf("no cat: %s", url)) + p(fmt.Fprintln(&buf, "##META text tag =")) + _ = w(fmt.Errorf("no tag: %s", url)) // geen fout, maar waarschuwing als er meer fouten zijn fouten = append(fouten, fmt.Sprintf("no text: %s\n", url)) // dus geen return false } else { for _, a := range aa { - cat = strings.ReplaceAll(a.Content(), "\n", " ") - p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat))) + tag = strings.ReplaceAll(a.Content(), "\n", " ") + p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag))) } } diff --git a/NieuwsNL/txt2corpus.sh b/NieuwsNL/txt2corpus.sh index eda4ea1..99ce671 100755 --- a/NieuwsNL/txt2corpus.sh +++ b/NieuwsNL/txt2corpus.sh @@ -58,6 +58,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus. cd xml alto -o $corpus.data.dz *.xml 2> /dev/null +# telling per bericht, niet per zin +/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt + cd ../.. rm -fr out diff --git a/RO/cmd/xml2txt/xml2txt.go b/RO/cmd/xml2txt/xml2txt.go index ad7f1a3..8ce8a5d 100644 --- a/RO/cmd/xml2txt/xml2txt.go +++ b/RO/cmd/xml2txt/xml2txt.go @@ -90,7 +90,7 @@ func main() { fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") x(err) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) + x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat))) } x(fp.WriteString(text)) x(fp.Close()) diff --git a/RO/txt2corpus.sh b/RO/txt2corpus.sh index 15859d5..cb4c91c 100755 --- a/RO/txt2corpus.sh +++ b/RO/txt2corpus.sh @@ -58,6 +58,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus. cd xml alto -o $corpus.data.dz *.xml 2> /dev/null +# telling per bericht, niet per zin +/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt + cd ../.. rm -fr out diff --git a/Sargasso/cmd/xml2txt/xml2txt.go b/Sargasso/cmd/xml2txt/xml2txt.go index be6f9ed..f04246a 100644 --- a/Sargasso/cmd/xml2txt/xml2txt.go +++ b/Sargasso/cmd/xml2txt/xml2txt.go @@ -57,7 +57,7 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) + x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat))) } x(fp.WriteString(addEnd(fixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) diff --git a/Sargasso/txt2corpus.sh b/Sargasso/txt2corpus.sh index 4ae2b31..eb84a1f 100755 --- a/Sargasso/txt2corpus.sh +++ b/Sargasso/txt2corpus.sh @@ -58,6 +58,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus. cd xml alto -o $corpus.data.dz *.xml 2> /dev/null +# telling per bericht, niet per zin +/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt + cd ../.. rm -fr out diff --git a/Tzum/cmd/xml2txt/xml2txt.go b/Tzum/cmd/xml2txt/xml2txt.go index 87a3bd2..97c0e21 100644 --- a/Tzum/cmd/xml2txt/xml2txt.go +++ b/Tzum/cmd/xml2txt/xml2txt.go @@ -57,7 +57,7 @@ func main() { var item Item x(xml.Unmarshal(b, &item)) for _, cat := range item.Cats { - x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) + x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat))) } x(fp.WriteString(addEnd(fixSpace(item.Title)))) doc, err := gokogiri.ParseHtml([]byte(`` + item.Text + ``)) diff --git a/Tzum/txt2corpus.sh b/Tzum/txt2corpus.sh index 280fabf..0ae1fb6 100755 --- a/Tzum/txt2corpus.sh +++ b/Tzum/txt2corpus.sh @@ -58,6 +58,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus. cd xml alto -o $corpus.data.dz *.xml 2> /dev/null +# telling per bericht, niet per zin +/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt + cd ../.. rm -fr out diff --git a/VRT/cmd/vrt/vrt.go b/VRT/cmd/vrt/vrt.go index 3aa384a..4657327 100644 --- a/VRT/cmd/vrt/vrt.go +++ b/VRT/cmd/vrt/vrt.go @@ -151,7 +151,9 @@ func main() { } } -func doArticle(filename string, url string, title string, tags []string, labels []string, timestamp time.Time, needUpdate bool) bool { +// Nstag -> tag +// Nslabeltag -> cat +func doArticle(filename string, url string, title string, tags []string, cats []string, timestamp time.Time, needUpdate bool) bool { if exists(filename + ".skip") { return true } @@ -236,18 +238,18 @@ func doArticle(filename string, url string, title string, tags []string, labels } } - if len(tags) == 0 { + if len(cats) == 0 { p(fmt.Fprintln(&buf, "##META text cat =")) } else { - for _, tag := range tags { - p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(tag))) + for _, cat := range cats { + p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat))) } } - if len(labels) == 0 { - p(fmt.Fprintln(&buf, "##META text label =")) + if len(tags) == 0 { + p(fmt.Fprintln(&buf, "##META text tag =")) } else { - for _, label := range labels { - p(fmt.Fprintf(&buf, "##META text label = %s\n", fixSpace(label))) + for _, tag := range tags { + p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag))) } } diff --git a/VRT/txt2corpus.sh b/VRT/txt2corpus.sh index 02b1d3e..faefee6 100755 --- a/VRT/txt2corpus.sh +++ b/VRT/txt2corpus.sh @@ -56,6 +56,10 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus. cd xml alto -o $corpus.data.dz *.xml 2> /dev/null +# telling per bericht, niet per zin +/net/corpora/nlnieuws/namen.sh -x 9 -s $corpus.data.dz > $corpus.cat.txt +/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt + cd ../.. rm -fr out diff --git a/namen.sh b/namen.sh index fbb94ea..1eaf408 100755 --- a/namen.sh +++ b/namen.sh @@ -20,6 +20,8 @@ gebruik: 6 : bestaande organisaties 7 : bestaande andere namen 8 : nieuwe adjectieven, deelwoorden en werkwoorden + 9 : categorieën + 10 : tags -i : kies interactief @@ -114,6 +116,18 @@ case $XN in TEMPLATE='tt:%w\t%P' XVALID=1 ;; + 9) + # categorieën + EXPR='fp://meta[@name="cat"]/@value' + TEMPLATE='tt:%m' + XVALID=1 + ;; + 10) + # tags + EXPR='fp://meta[@name="tag"]/@value' + TEMPLATE='tt:%m' + XVALID=1 + ;; '') ;; *)