# 1 Exercise 01

Generate a data frame abc that contains the letters from a to j in the first column and the integers from 10 to 1 in the second column. Make sure the first column is called LETTER and the second NUMBER.

# solution 1:
LETTER <- letters[1:10]   # a to j: the first 10 elements of the inbuilt vector letters
NUMBER <- 10:1            # the integers from 10 down to 1
abc <- data.frame(        # combine both vectors into the data frame abc
  LETTER,                 # LETTER becomes the 1st column
  NUMBER,                 # NUMBER becomes the 2nd column
  stringsAsFactors = TRUE # and categorical variables are made factors (!)
)

# solution 2 (in one go, w/out creating LETTER & NUMBER separately first)
abc <- data.frame(
  LETTER = letters[1:10], # 1st column LETTER: the letters from a to j
  NUMBER = 10:1,          # 2nd column NUMBER: the integers from 10 to 1
  stringsAsFactors = TRUE # categorical variables are made factors (!)
)

# 2 Exercise 02

Load the text file <inputfiles/201_03_dataframe1.csv> into a data frame example and check the loading was successful.

example <- read.delim(                       # read the data frame example
  file = "inputfiles/201_03_dataframe1.csv", # from this file
  stringsAsFactors = TRUE                    # w/ categorical variables as factors (!)
)
summary(example) # summarize example to check that the loading was successful
##       CASE       GRMRELATION     LENGTH       DEFINITENESS
##  Min.   : 1.00   obj :6      Min.   : 2.000   def  :6
##  1st Qu.: 3.75   subj:6      1st Qu.: 3.750   indef:6
##  Median : 6.50               Median : 7.000
##  Mean   : 6.50               Mean   : 6.417
##  3rd Qu.: 9.25               3rd Qu.: 9.000
##  Max.   :12.00               Max.   :10.000

# 3 Exercise 03

Extract from this data frame

• the second and third columns;
• the third and fourth rows.
# 1st question
example$GRMRELATION # column 2: individually
##  [1] obj  obj  obj  obj  obj  obj  subj subj subj subj subj subj
## Levels: obj subj
example$LENGTH      # column 3: individually
##  [1]  2  2 10  6  7  4  3  9  9  9  7  9
example[, 2:3] # columns 2 & 3 jointly (way 1: by position)
##    GRMRELATION LENGTH
## 1          obj      2
## 2          obj      2
## 3          obj     10
## 4          obj      6
## 5          obj      7
## 6          obj      4
## 7         subj      3
## 8         subj      9
## 9         subj      9
## 10        subj      9
## 11        subj      7
## 12        subj      9
example[, c("GRMRELATION", "LENGTH")] # columns 2 & 3 jointly (way 2: by name)
##    GRMRELATION LENGTH
## 1          obj      2
## 2          obj      2
## 3          obj     10
## 4          obj      6
## 5          obj      7
## 6          obj      4
## 7         subj      3
## 8         subj      9
## 9         subj      9
## 10        subj      9
## 11        subj      7
## 12        subj      9
# 2nd question
example[3:4, ] # rows 3 & 4, all columns
##   CASE GRMRELATION LENGTH DEFINITENESS
## 3    3         obj     10          def
## 4    4         obj      6        indef

# 4 Exercise 04

Split the data frame example up according to the content of the second column (enter ?split at the R prompt for help).

# option 1
split(                    # split up
  x = example,            # the data frame example
  f = example$GRMRELATION # depending on the values of the column GRMRELATION
)
## $obj
##   CASE GRMRELATION LENGTH DEFINITENESS
## 1    1         obj      2          def
## 2    2         obj      2          def
## 3    3         obj     10          def
## 4    4         obj      6        indef
## 5    5         obj      7        indef
## 6    6         obj      4        indef
##
## $subj
##    CASE GRMRELATION LENGTH DEFINITENESS
## 7     7        subj      3          def
## 8     8        subj      9          def
## 9     9        subj      9          def
## 10   10        subj      9        indef
## 11   11        subj      7        indef
## 12   12        subj      9        indef

# option 2
split(example,              # split up the data frame example
  example[, "GRMRELATION"]) # depending on the values of the column GRMRELATION
## $obj
##   CASE GRMRELATION LENGTH DEFINITENESS
## 1    1         obj      2          def
## 2    2         obj      2          def
## 3    3         obj     10          def
## 4    4         obj      6        indef
## 5    5         obj      7        indef
## 6    6         obj      4        indef
##
## $subj
##    CASE GRMRELATION LENGTH DEFINITENESS
## 7     7        subj      3          def
## 8     8        subj      9          def
## 9     9        subj      9          def
## 10   10        subj      9        indef
## 11   11        subj      7        indef
## 12   12        subj      9        indef

# option 3
split(example,  # split up the data frame example
  example[, 2]) # depending on the values of the 2nd column
## $obj
##   CASE GRMRELATION LENGTH DEFINITENESS
## 1    1         obj      2          def
## 2    2         obj      2          def
## 3    3         obj     10          def
## 4    4         obj      6        indef
## 5    5         obj      7        indef
## 6    6         obj      4        indef
##
## $subj
##    CASE GRMRELATION LENGTH DEFINITENESS
## 7     7        subj      3          def
## 8     8        subj      9          def
## 9     9        subj      9          def
## 10   10        subj      9        indef
## 11   11        subj      7        indef
## 12   12        subj      9        indef

Here are separate manual alternatives:

subset(example,                 # show a subset of the data frame example, namely
  example$GRMRELATION == "obj") # when GRMRELATION is "obj"
##   CASE GRMRELATION LENGTH DEFINITENESS
## 1    1         obj      2          def
## 2    2         obj      2          def
## 3    3         obj     10          def
## 4    4         obj      6        indef
## 5    5         obj      7        indef
## 6    6         obj      4        indef
subset(example,                  # show a subset of the data frame example, namely
  example$GRMRELATION == "subj") # when GRMRELATION is "subj"
##    CASE GRMRELATION LENGTH DEFINITENESS
## 7     7        subj      3          def
## 8     8        subj      9          def
## 9     9        subj      9          def
## 10   10        subj      9        indef
## 11   11        subj      7        indef
## 12   12        subj      9        indef

# 5 Exercise 05

Change the value at the intersection of the third row and the fourth column into “indef” and save the changed data frame into <inputfiles/201_03_dataframe2.csv> such that you can easily load/edit in a spreadsheet software.

example[3, 4] <- "indef"
write.table(                                # write the data frame
  example,                                  # example
  sep = "\t", eol = "\n",                   # with tabs between columns & line breaks
  row.names = FALSE, quote = FALSE,         # no row names & quotes (for factors)
  file = "inputfiles/201_03_dataframe2.csv") # into this file

# 6 Exercise 06

Generate the following data frame and call it EPP (for English personal pronouns) […]

PRONOUN PERSON NUMBER
I       1      sg
you     2      sg
he      3      sg
she     3      sg
it      3      sg
we      1      pl
you     2      pl
they    3      pl

(EPP <- data.frame(
  PRONOUN = c("I", "you", "he", "she", "it", "we", "you", "they"),
  PERSON = c(1, 2, 3, 3, 3, 1, 2, 3),
  NUMBER = rep(c("sg", "pl"), c(5, 3)),
  stringsAsFactors = TRUE))
##   PRONOUN PERSON NUMBER
## 1       I      1     sg
## 2     you      2     sg
## 3      he      3     sg
## 4     she      3     sg
## 5      it      3     sg
## 6      we      1     pl
## 7     you      2     pl
## 8    they      3     pl

# 7 Exercise 07

Extract from this data frame EPP

• the value of the fourth row and the second column;
• the values of the third to fourth rows and the first to second columns;
• the rows that have plural pronouns in them;
• the rows with first and third person pronouns.

# 1st question
EPP[4, 2]
## [1] 3
# 2nd question
EPP[3:4, 1:2]
##   PRONOUN PERSON
## 3      he      3
## 4     she      3
# 3rd question
EPP[EPP$NUMBER == "pl", ] # or
##   PRONOUN PERSON NUMBER
## 6      we      1     pl
## 7     you      2     pl
## 8    they      3     pl
subset(                # show a subset
  EPP,                 # of the data frame EPP, namely
  EPP$NUMBER == "pl")  # when NUMBER is "pl"
##   PRONOUN PERSON NUMBER
## 6      we      1     pl
## 7     you      2     pl
## 8    they      3     pl
# 4th question
EPP[(EPP$PERSON == 1    # of EPP, the rows when PERSON is 1
  |                     # or
  EPP$PERSON == 3), ]   # when PERSON is 3
##   PRONOUN PERSON NUMBER
## 1       I      1     sg
## 3      he      3     sg
## 4     she      3     sg
## 5      it      3     sg
## 6      we      1     pl
## 8    they      3     pl
EPP[EPP$PERSON != 2, ] # of EPP, the rows when PERSON is not 2
##   PRONOUN PERSON NUMBER
## 1       I      1     sg
## 3      he      3     sg
## 4     she      3     sg
## 5      it      3     sg
## 6      we      1     pl
## 8    they      3     pl
subset(                     # show a subset
  EPP,                      # of the data frame EPP, namely
  EPP$PERSON %in% c(1, 3))  # when PERSON is in the set 1, 3
##   PRONOUN PERSON NUMBER
## 1       I      1     sg
## 3      he      3     sg
## 4     she      3     sg
## 5      it      3     sg
## 6      we      1     pl
## 8    they      3     pl

# 8 Exercise 08

Generate a vector FREQS of the frequencies with which the personal pronouns in EPP occurred in a small corpus: I: 8426, you: 9462, he: 6394, she: 4234, it: 6040, we: 2305, you: 8078, they: 2998. Then, make this vector the fourth column of EPP.

EPP$FREQS <- c(8426, 9462, 6394, 4234, 6040, 2305, 8078, 2998) # or
EPP[, "FREQS"] <- c(8426, 9462, 6394, 4234, 6040, 2305, 8078, 2998)

# 9 Exercise 09

Save the data frame EPP into <inputfiles/201_03_dataframe3.csv> such that you can easily load/edit in a spreadsheet software.

write.table(
  x = EPP,                                   # write the data frame EPP
  file = "inputfiles/201_03_dataframe3.csv", # into this file
  sep = "\t", eol = "\n",                    # tabs between columns, line breaks between rows
  row.names = FALSE, quote = FALSE           # no row names & no quotes (for factors)
)

# 10 Exercise 10

The file <inputfiles/201_03_dataframe4.csv> contains data for the VERB into VERBing construction in the BNC (e.g., He [V1 forced] him into [V2 speaking] about it). For each instance of one such construction, the file contains

• the column BNC: the file where the instance was found (e.g., A06 in row 1);
• the column VERB_LEMMA: the lemma of the finite verb (e.g., force in row 1);
• the column ING_FORM: the gerund (e.g., speaking in row 1);
• the column ING_TAG: the part-of-speech tag of the gerund (e.g., VVG in row 1);
• the column ING_LEMMA: the lemma of the gerund (e.g., speak in row 1).

Load this file into a data frame x and check the loading was successful.

x <- read.delim(                             # import the data structure x
  file = "inputfiles/201_03_dataframe4.csv", # from this file
  stringsAsFactors = TRUE                    # and make categorical variables factors (!)
)
summary(x) # summarize x to check that the loading was successful
##       BNC         VERB_LEMMA        ING_FORM       ING_TAG       ING_LEMMA
##  HH3    :  11   force  : 101   thinking : 146   VVG    :1239   think  : 147
##  K5D    :  11   trick  :  92   believing: 104   NN1-VVG: 158   believe: 104
##  CBG    :  10   fool   :  77   making   :  62   AJ0-VVG: 108   make   :  62
##  EUU    :  10   talk   :  62   giving   :  54   VDG    :  49   give   :  54
##  HGM    :  10   mislead:  57   accepting:  51   VBG    :  23   accept :  51
##  HXE    :  10   coerce :  52   doing    :  49   VHG    :  15   do     :  49
##  (Other):1538   (Other):1159   (Other)  :1134   (Other):   8   (Other):1133
head(x, n = 6) # show only the first 6 rows of x
##   BNC VERB_LEMMA ING_FORM ING_TAG ING_LEMMA
## 1 A06      force speaking     VVG     speak
## 2 A08      nudge    being     VBG        be
## 3 A0C       talk   taking     VVG      take
## 4 A0F      bully   taking     VVG      take
## 5 A0H  influence   trying     VVG       try
## 6 A0H     delude thinking     VVG     think

# 11 Exercise 11

• What is the quickest way of identifying the numbers of verb lemma types and -ing lemma types? (don’t just use str!)
• What is the most frequent verb lemma? (don’t just use summary!)
• What is the most frequent -ing lemma with this verb lemma?
# 1st question
str(x) # or
## 'data.frame':    1600 obs. of  5 variables:
##  $ BNC       : Factor w/ 929 levels "A06","A08","A0C",..: 1 2 3 4 5 5 6 6 7 8 ...
##  $ VERB_LEMMA: Factor w/ 208 levels "activate","aggravate",..: 76 126 186 26 96 51 75 149 152 186 ...
##  $ ING_FORM  : Factor w/ 422 levels "abandoning","abdicating",..: 354 49 382 382 395 387 387 133 209 175 ...
##  $ ING_TAG   : Factor w/ 10 levels "AJ0-NN1","AJ0-VVG",..: 10 7 10 10 10 10 10 1 10 10 ...
##  $ ING_LEMMA : Factor w/ 416 levels "abandon","abdicate",..: 349 41 377 377 389 382 382 377 207 173 ...
length(                 # how many
  unique(x$VERB_LEMMA)) # different types/levels of VERB_LEMMA are there?
## [1] 208
length(                 # how many
  unique(x$ING_LEMMA))  # unique types/levels of ING_LEMMA are there?
## [1] 416
# 2nd question
names(tail(sort(          # show the name of the tail of the sorted
  table(x$VERB_LEMMA)),   # frequency table of VERB_LEMMA
  1))                     # namely the last item
## [1] "force"
# 3rd question
names(tail(sort(                # the name of the end of the sorted
  table(x$ING_LEMMA[            # frequency table of ING_LEMMA where
    x$VERB_LEMMA == "force"])   # VERB_LEMMA is "force"
), 1)) # end of sort(), then end of tail(), then end of names()
## [1] "make"

Here’s a great alternative for the 2nd question using the pipe (%>%) from the package magrittr:

library(magrittr)
x$VERB_LEMMA %>% table %>% sort %>% tail(1) %>% names
## [1] "force"

# 12 Exercise 12

• Delete the column with the corpus files; the new data frame is to be called xx;
• delete the rows with the four rarest tags; the new data frame is to be called xxx;
• from xxx, create a new data frame xxxx which is sorted according to
  • the column VERB_LEMMA (ascending) and, within that,
  • the column ING_LEMMA (descending);
• save the changed data frame into a tab-delimited text file <inputfiles/201_03_dataframe5.csv> such that you can easily load/edit in a spreadsheet software.

xx <- x[, 2:5] # keep columns 2 to 5, i.e. drop the 1st column (BNC, the corpus files)
# step 1: determine the rarest tags
sort(                 # sort
  table(xx$ING_TAG))  # the frequency table of ING_TAG (note the xx$ prefix)
##
## AJ0-NN1     CJS     UNC     NN1     VHG     VBG     VDG AJ0-VVG NN1-VVG     VVG
##       1       1       2       4      15      23      49     108     158    1239
# step 2: determine the vector of deletees
deletees <- which(            # the deletees are where
  xx$ING_TAG == "AJ0-NN1" |   # xx$ING_TAG is "AJ0-NN1" or where
  xx$ING_TAG == "CJS" |       # xx$ING_TAG is "CJS" or where
  xx$ING_TAG == "UNC" |       # xx$ING_TAG is "UNC" or where
  xx$ING_TAG == "NN1")        # xx$ING_TAG is "NN1"
# step 3: delete
xxx <- xx[-deletees, ]
order.index <- order(     # make order.index the order
  xxx$VERB_LEMMA,         # to sort by xxx$VERB_LEMMA, then break ties by
  -rank(xxx$ING_LEMMA))   # (reversed) xxx$ING_LEMMA
xxxx <- xxx[order.index, ] # make xxxx the re-sorted data frame
write.table(xxxx,                           # write the data frame xxxx
  sep = "\t", eol = "\n",                   # with tabs between columns & line breaks
  row.names = FALSE, quote = FALSE,         # no row names & quotes (for factors)
  file = "inputfiles/201_03_dataframe5.csv") # into this file
# A more advanced alternative for the second task:
# (the column is ING_TAG; a non-existent column name would yield NULL via $ and
# silently select zero rows)
xxx <- xx[(!(xx$ING_TAG %in% names(sort(table(xx$ING_TAG)))[1:4])), ]
sessionInfo() # report the R version, platform, locale, and attached/loaded packages of this session
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Pop!_OS 22.04 LTS
##
## Matrix products: default
##
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   compiler
## [8] base
##
## other attached packages:
## [1] magrittr_2.0.3
##
## loaded via a namespace (and not attached):
##  [1] digest_0.6.31   R6_2.5.1        lifecycle_1.0.3 jsonlite_1.8.4
##  [5] evaluate_0.19   stringi_1.7.12  cachem_1.0.6    rlang_1.0.6
##  [9] cli_3.6.0       rstudioapi_0.14 jquerylib_0.1.4 bslib_0.4.2
## [13] vctrs_0.5.1     rmarkdown_2.19  tools_4.2.2     stringr_1.5.0
## [17] glue_1.6.2      xfun_0.36       yaml_2.3.6      fastmap_1.1.0
## [21] htmltools_0.5.4 knitr_1.41      sass_0.4.4

# 13 Homework

To prepare for next week, read (and work through!) SFLWR3: Sections 3.1 and 3.5.3.