Kapitel 11 Konnektoren
11.1 Pakete
library(tidyverse)
library(scales)
library(janitor)
library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(tidytext)
library(readxl)
library(writexl)
library(udpipe)
11.2 Text einlesen
# Read every plain-text book under data/books/ as UTF-8. The printed object
# has one row per file with the columns doc_id and text.
txt <- readtext("data/books/*.txt", encoding = "UTF-8")
txt
## readtext object consisting of 2 documents and 0 docvars.
## # Description: df [2 x 2]
## doc_id text
## <chr> <chr>
## 1 prozess.txt "\"Der Prozes\"..."
## 2 tom.txt "\"Tom Sawyer\"..."
11.3 UDPipe laden
library(udpipe)

# File name of the German UDPipe model, looked up relative to the working
# directory.
destfile <- "german-gsd-ud-2.5-191206.udpipe"

# Download the model only when the file is not present yet; otherwise load
# the local copy. Either way the loaded model ends up in udmodel_de.
if (!file.exists(destfile)) {
  sprachmodell <- udpipe_download_model(language = "german")
  udmodel_de <- udpipe_load_model(sprachmodell$file_model)
} else {
  file_model <- destfile
  udmodel_de <- udpipe_load_model(file_model)
}
11.4 Text annotieren
https://universaldependencies.org/
# readtext() read the texts at the beginning; they are stored in the
# variable "txt".
x <- udpipe_annotate(udmodel_de, x = txt$text, trace = TRUE)
## 2022-03-22 22:01:39 Annotating text fragment 1/2
## 2022-03-22 22:03:32 Annotating text fragment 2/2
# # first text only:
# x <- udpipe_annotate(udmodel_de, x = txt$text[1], trace = TRUE)

# Convert the annotation result into a data frame; the columns doc_id, token
# and upos are used in the sections below.
x <- as.data.frame(x)

# Annotating takes a couple of minutes (see the timestamps above), so the
# result can be cached on disk and re-read instead of re-annotating:
# write_rds(x, "data/prozess_tom_udpiped.rds")
# x = read_rds("data/prozess_tom_udpiped.rds")
11.5 Wortklassen und Konnektoren
# Count how often each universal POS tag occurs per document, leaving out
# tokens without a tag and punctuation.
tabela <- x %>%
  group_by(doc_id) %>%
  count(upos) %>%
  filter(
    !is.na(upos),
    upos != "PUNCT"
  )

head(tabela) %>% rmarkdown::paged_table()
# Keep only coordinating (CCONJ) and subordinating (SCONJ) conjunctions and
# put the documents side by side. tabela is built with group_by(doc_id) and
# is not ungrouped, so sum(n) -- and therefore prozent -- is computed per
# document.
vezniki <- tabela %>%
  filter(upos %in% c("CCONJ", "SCONJ")) %>%
  mutate(prozent = n / sum(n)) %>%
  pivot_wider(
    id_cols = upos,
    names_from = doc_id,
    values_from = n:prozent
  )

head(vezniki) %>% rmarkdown::paged_table()
# Frequency of every lower-cased conjunction token per document and POS tag,
# most frequent first. Filtering on the two conjunction tags already drops
# untagged tokens and punctuation, so no separate NA/PUNCT filter is needed.
konnektoren <- x %>%
  mutate(token = str_to_lower(token)) %>%
  group_by(doc_id, token) %>%
  filter(upos %in% c("CCONJ", "SCONJ")) %>%
  count(upos, sort = TRUE)
# Same count as above, but pooled over both documents: frequency of every
# lower-cased conjunction token per POS tag, most frequent first. Filtering
# on the two conjunction tags already drops untagged tokens and punctuation.
connectors <- x %>%
  mutate(token = str_to_lower(token)) %>%
  group_by(token) %>%
  filter(upos %in% c("CCONJ", "SCONJ")) %>%
  count(upos, sort = TRUE)

# The (up to) 50 most frequent tokens tagged as coordinating conjunctions.
connectors %>%
  filter(upos == "CCONJ") %>%
  pull(token) %>%
  head(50)
## [1] "und" "aber" "oder"
## [4] "denn" "sondern" "wie"
## [7] "als" "weder" "doch"
## [10] "noch" "schrie" "kroch"
## [13] "desto" "um" "woher"
## [16] "du" "entweder" "hatte"
## [19] "sowie" "statt" ",aber"
## [22] ",und" "sowohl" "irgendwie"
## [25] "unnötigerweise" ",denn" ",nun"
## [28] "aschfahl" "aß" "ausnahmsweise"
## [31] "besinn" "brauch" "daß"
## [34] "dazu" "dennoch" "genau"
## [37] "hoch" "insbesondere" "kund"
## [40] "laß" "manch" "ob"
## [43] "sinn" "stahl" "such"
## [46] "unvorsichtigerweise" "verzeih" "wieder"
## [49] "wozu"
# The (up to) 50 most frequent tokens tagged as subordinating conjunctions.
connectors %>%
  filter(upos == "SCONJ") %>%
  pull(token) %>%
  head(50)
## [1] "daß" "wenn" "als" "wie" "da"
## [6] "denn" "während" "ob" "bis" "weil"
## [11] "obwohl" "indem" "laß" "nachdem" "damit"
## [16] "sobald" "ehe" "ohne" "solange" "soweit"
## [21] "bevor" "das" "aber" "seit" "seitdem"
## [26] "strich" "gleich" "begann" "falls" "vergaß"
## [31] "worum" "halb" "hätt" "maß" "sehe"
## [36] "statt" "warum" "wohl" ",das" ",denn"
## [41] ",ich" "befahl" "bestrich" "dann" "dasaß"
## [46] "dass" "fern" "fortwährend" "fühl" "gebührend"
Wir können auch Listen mit neben- und unterordnenden Konjunktionen sowie Konjunktionaladverbien aus dem Internet abrufen. Dann können wir sie genauer zählen.
Skizze: case_when() mit str_detect(seznam_prirednih_konektorjev, "…|…|…"), str_detect(seznam_podrednih_konektorjev, "…|…|…") und str_detect(seznam_prislovnih_konektorjev, "…|…|…").