cats en tags

This commit is contained in:
Peter Kleiweg
2026-04-01 19:30:10 +02:00
parent 1fb1867550
commit e550b58889
14 changed files with 61 additions and 22 deletions

View File

@@ -58,6 +58,10 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.
cd xml cd xml
alto -o $corpus.data.dz *.xml 2> /dev/null alto -o $corpus.data.dz *.xml 2> /dev/null
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x 9 -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -211,11 +211,11 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
return false return false
} }
cats := make([]string, 0) tags := make([]string, 0)
var buffer bytes.Buffer var buffer bytes.Buffer
for _, i := range doc.Graph { for _, i := range doc.Graph {
p(buffer.WriteString(html.UnescapeString(i.ArticleBody))) p(buffer.WriteString(html.UnescapeString(i.ArticleBody)))
cats = append(cats, i.ArticleSection...) tags = append(tags, i.ArticleSection...)
} }
text := buffer.String() text := buffer.String()
@@ -245,11 +245,11 @@ func doArticle(filename string, url string, timestamp time.Time, needUpdate bool
fp, err := os.Create(filename + ".txt") fp, err := os.Create(filename + ".txt")
p(err) p(err)
if len(cats) == 0 { if len(tags) == 0 {
p(fmt.Fprintln(fp, "##META text cat =")) p(fmt.Fprintln(fp, "##META text tag ="))
} else { } else {
for _, cat := range cats { for _, tag := range tags {
p(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) p(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(tag)))
} }
} }
p(fp.WriteString(text)) p(fp.WriteString(text))

View File

@@ -56,6 +56,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.
cd xml cd xml
alto -o $corpus.data.dz *.xml 2> /dev/null alto -o $corpus.data.dz *.xml 2> /dev/null
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -161,19 +161,19 @@ func doArticle(filename string, url string, title string, timestamp time.Time, n
root := doc.Root() root := doc.Root()
var cat string var tag string
aa, err := root.Search(`//a[contains(@class, "articleHeader__info__category")]`) aa, err := root.Search(`//a[contains(@class, "articleHeader__info__category")]`)
p(err) p(err)
if len(aa) == 0 { if len(aa) == 0 {
p(fmt.Fprintln(&buf, "##META text cat =")) p(fmt.Fprintln(&buf, "##META text tag ="))
_ = w(fmt.Errorf("no cat: %s", url)) _ = w(fmt.Errorf("no tag: %s", url))
// geen fout, maar waarschuwing als er meer fouten zijn // geen fout, maar waarschuwing als er meer fouten zijn
fouten = append(fouten, fmt.Sprintf("no text: %s\n", url)) fouten = append(fouten, fmt.Sprintf("no text: %s\n", url))
// dus geen return false // dus geen return false
} else { } else {
for _, a := range aa { for _, a := range aa {
cat = strings.ReplaceAll(a.Content(), "\n", " ") tag = strings.ReplaceAll(a.Content(), "\n", " ")
p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat))) p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
} }
} }

View File

@@ -58,6 +58,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.
cd xml cd xml
alto -o $corpus.data.dz *.xml 2> /dev/null alto -o $corpus.data.dz *.xml 2> /dev/null
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -90,7 +90,7 @@ func main() {
fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt") fp, err := os.Create("out/" + filename[:len(filename)-4] + ".txt")
x(err) x(err)
for _, cat := range item.Cats { for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
} }
x(fp.WriteString(text)) x(fp.WriteString(text))
x(fp.Close()) x(fp.Close())

View File

@@ -58,6 +58,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.
cd xml cd xml
alto -o $corpus.data.dz *.xml 2> /dev/null alto -o $corpus.data.dz *.xml 2> /dev/null
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -57,7 +57,7 @@ func main() {
var item Item var item Item
x(xml.Unmarshal(b, &item)) x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats { for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
} }
x(fp.WriteString(addEnd(fixSpace(item.Title)))) x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))

View File

@@ -58,6 +58,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.
cd xml cd xml
alto -o $corpus.data.dz *.xml 2> /dev/null alto -o $corpus.data.dz *.xml 2> /dev/null
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -57,7 +57,7 @@ func main() {
var item Item var item Item
x(xml.Unmarshal(b, &item)) x(xml.Unmarshal(b, &item))
for _, cat := range item.Cats { for _, cat := range item.Cats {
x(fmt.Fprintf(fp, "##META text cat = %s\n", fixSpace(cat))) x(fmt.Fprintf(fp, "##META text tag = %s\n", fixSpace(cat)))
} }
x(fp.WriteString(addEnd(fixSpace(item.Title)))) x(fp.WriteString(addEnd(fixSpace(item.Title))))
doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`)) doc, err := gokogiri.ParseHtml([]byte(`<html><body>` + item.Text + `</body></html>`))

View File

@@ -58,6 +58,9 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.
cd xml cd xml
alto -o $corpus.data.dz *.xml 2> /dev/null alto -o $corpus.data.dz *.xml 2> /dev/null
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -151,7 +151,9 @@ func main() {
} }
} }
func doArticle(filename string, url string, title string, tags []string, labels []string, timestamp time.Time, needUpdate bool) bool { // Nstag -> tag
// Nslabeltag -> cat
func doArticle(filename string, url string, title string, tags []string, cats []string, timestamp time.Time, needUpdate bool) bool {
if exists(filename + ".skip") { if exists(filename + ".skip") {
return true return true
} }
@@ -236,18 +238,18 @@ func doArticle(filename string, url string, title string, tags []string, labels
} }
} }
if len(tags) == 0 { if len(cats) == 0 {
p(fmt.Fprintln(&buf, "##META text cat =")) p(fmt.Fprintln(&buf, "##META text cat ="))
} else { } else {
for _, tag := range tags { for _, cat := range cats {
p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(tag))) p(fmt.Fprintf(&buf, "##META text cat = %s\n", fixSpace(cat)))
} }
} }
if len(labels) == 0 { if len(tags) == 0 {
p(fmt.Fprintln(&buf, "##META text label =")) p(fmt.Fprintln(&buf, "##META text tag ="))
} else { } else {
for _, label := range labels { for _, tag := range tags {
p(fmt.Fprintf(&buf, "##META text label = %s\n", fixSpace(label))) p(fmt.Fprintf(&buf, "##META text tag = %s\n", fixSpace(tag)))
} }
} }

View File

@@ -56,6 +56,10 @@ Alpino -flag treebank xml debug=1 end_hook=xml user_max=900000 -parse < $corpus.
cd xml cd xml
alto -o $corpus.data.dz *.xml 2> /dev/null alto -o $corpus.data.dz *.xml 2> /dev/null
# telling per bericht, niet per zin
/net/corpora/nlnieuws/namen.sh -x 9 -s $corpus.data.dz > $corpus.cat.txt
/net/corpora/nlnieuws/namen.sh -x 10 -s $corpus.data.dz > $corpus.tag.txt
cd ../.. cd ../..
rm -fr out rm -fr out

View File

@@ -20,6 +20,8 @@ gebruik:
6 : bestaande organisaties 6 : bestaande organisaties
7 : bestaande andere namen 7 : bestaande andere namen
8 : nieuwe adjectieven, deelwoorden en werkwoorden 8 : nieuwe adjectieven, deelwoorden en werkwoorden
9 : categorieën
10 : tags
-i : kies interactief -i : kies interactief
@@ -114,6 +116,18 @@ case $XN in
TEMPLATE='tt:%w\t%P' TEMPLATE='tt:%w\t%P'
XVALID=1 XVALID=1
;; ;;
9)
# categorieën
EXPR='fp://meta[@name="cat"]/@value'
TEMPLATE='tt:%m'
XVALID=1
;;
10)
# tags
EXPR='fp://meta[@name="tag"]/@value'
TEMPLATE='tt:%m'
XVALID=1
;;
'') '')
;; ;;
*) *)