1 Exercise 01

Generate a data frame abc that contains the letters from a to j in the first column and the integers from 10 to 1 in the second column. Make sure the first column is called LETTER and the second NUMBER.

# solution 1 (build the two columns separately, then combine them):
LETTER <- letters[seq_len(10)] # the first 10 elements of the inbuilt vector letters (a to j)
NUMBER <- rev(1:10)            # the integers from 10 down to 1
abc <- data.frame(LETTER, NUMBER, # a data frame abc w/ LETTER as the 1st & NUMBER as the 2nd column
   stringsAsFactors=TRUE)         # and make categorical variables factors (!)

# solution 2 (a single call, w/out creating LETTER & NUMBER separately first)
abc <- data.frame(stringsAsFactors=TRUE, # a data frame abc w/ categorical variables as factors (!)
   LETTER=head(letters, 10),             # 1st column LETTER: the letters from a to j
   NUMBER=seq(10, 1))                    # 2nd column NUMBER: the integers from 10 to 1

2 Exercise 02

Load the text file <inputfiles/201_03_dataframe1.csv> into a data frame example and check the loading was successful.

example <- read.delim(                 # read into the data frame example
   "inputfiles/201_03_dataframe1.csv", # the contents of this file
   stringsAsFactors=TRUE)              # w/ categorical variables as factors (!)
summary(example)                       # then summarize example to check the loading
##       CASE       GRMRELATION     LENGTH       DEFINITENESS
##  Min.   : 1.00   obj :6      Min.   : 2.000   def  :6     
##  1st Qu.: 3.75   subj:6      1st Qu.: 3.750   indef:6     
##  Median : 6.50               Median : 7.000               
##  Mean   : 6.50               Mean   : 6.417               
##  3rd Qu.: 9.25               3rd Qu.: 9.000               
##  Max.   :12.00               Max.   :10.000

3 Exercise 03

Extract from this data frame

  • the second and third column
  • the third and fourth row
# 1st question
example[["GRMRELATION"]] # column 2 on its own
##  [1] obj  obj  obj  obj  obj  obj  subj subj subj subj subj subj
## Levels: obj subj
example[["LENGTH"]]      # column 3 on its own
##  [1]  2  2 10  6  7  4  3  9  9  9  7  9
example[, c(2, 3)] # columns 2 & 3 together (way 1: by position)
##    GRMRELATION LENGTH
## 1          obj      2
## 2          obj      2
## 3          obj     10
## 4          obj      6
## 5          obj      7
## 6          obj      4
## 7         subj      3
## 8         subj      9
## 9         subj      9
## 10        subj      9
## 11        subj      7
## 12        subj      9
example[c("GRMRELATION", "LENGTH")] # columns 2 & 3 together (way 2: by name)
##    GRMRELATION LENGTH
## 1          obj      2
## 2          obj      2
## 3          obj     10
## 4          obj      6
## 5          obj      7
## 6          obj      4
## 7         subj      3
## 8         subj      9
## 9         subj      9
## 10        subj      9
## 11        subj      7
## 12        subj      9
# 2nd question
example[c(3, 4), ] # rows 3 & 4, with all columns
##   CASE GRMRELATION LENGTH DEFINITENESS
## 3    3         obj     10          def
## 4    4         obj      6        indef

4 Exercise 04

Split the data frame example up according to the content of the second column (enter ?split at the R prompt for help).

# option 1
split(x=example,          # the data frame to split up is example
   f=example$GRMRELATION) # the grouping is given by the values of the column GRMRELATION
## $obj
##   CASE GRMRELATION LENGTH DEFINITENESS
## 1    1         obj      2          def
## 2    2         obj      2          def
## 3    3         obj     10          def
## 4    4         obj      6        indef
## 5    5         obj      7        indef
## 6    6         obj      4        indef
## 
## $subj
##    CASE GRMRELATION LENGTH DEFINITENESS
## 7     7        subj      3          def
## 8     8        subj      9          def
## 9     9        subj      9          def
## 10   10        subj      9        indef
## 11   11        subj      7        indef
## 12   12        subj      9        indef
# option 2
split(x=example,              # the data frame to split up is example
   f=example[,"GRMRELATION"]) # the grouping is the column GRMRELATION, picked by name
## $obj
##   CASE GRMRELATION LENGTH DEFINITENESS
## 1    1         obj      2          def
## 2    2         obj      2          def
## 3    3         obj     10          def
## 4    4         obj      6        indef
## 5    5         obj      7        indef
## 6    6         obj      4        indef
## 
## $subj
##    CASE GRMRELATION LENGTH DEFINITENESS
## 7     7        subj      3          def
## 8     8        subj      9          def
## 9     9        subj      9          def
## 10   10        subj      9        indef
## 11   11        subj      7        indef
## 12   12        subj      9        indef
# option 3
split(x=example,  # the data frame to split up is example
   f=example[,2]) # the grouping is the 2nd column, picked by position
## $obj
##   CASE GRMRELATION LENGTH DEFINITENESS
## 1    1         obj      2          def
## 2    2         obj      2          def
## 3    3         obj     10          def
## 4    4         obj      6        indef
## 5    5         obj      7        indef
## 6    6         obj      4        indef
## 
## $subj
##    CASE GRMRELATION LENGTH DEFINITENESS
## 7     7        subj      3          def
## 8     8        subj      9          def
## 9     9        subj      9          def
## 10   10        subj      9        indef
## 11   11        subj      7        indef
## 12   12        subj      9        indef

Here are separate manual alternatives:

subset(example,        # of the data frame example, show only
   GRMRELATION=="obj") # the rows whose GRMRELATION is "obj"
##   CASE GRMRELATION LENGTH DEFINITENESS
## 1    1         obj      2          def
## 2    2         obj      2          def
## 3    3         obj     10          def
## 4    4         obj      6        indef
## 5    5         obj      7        indef
## 6    6         obj      4        indef
subset(example,         # of the data frame example, show only
   GRMRELATION=="subj") # the rows whose GRMRELATION is "subj"
##    CASE GRMRELATION LENGTH DEFINITENESS
## 7     7        subj      3          def
## 8     8        subj      9          def
## 9     9        subj      9          def
## 10   10        subj      9        indef
## 11   11        subj      7        indef
## 12   12        subj      9        indef

5 Exercise 05

Change the value at the intersection of the third row and the fourth column into “indef” and save the changed data frame into <inputfiles/201_03_dataframe2.csv> such that you can easily load and edit it in spreadsheet software.

example[3, 4] <- "indef" # row 3 of column 4 becomes "indef"
write.table(x=example,                      # write the data frame example
   file="inputfiles/201_03_dataframe2.csv", # into this file
   sep="\t", eol="\n",                      # w/ tabs between columns & line breaks after rows
   quote=FALSE, row.names=FALSE)            # w/out quotes (for factors) & w/out row names

6 Exercise 06

Generate the following data frame and call it EPP (for English personal pronouns) […]

PRONOUN PERSON NUMBER
I 1 sg
you 2 sg
he 3 sg
she 3 sg
it 3 sg
we 1 pl
you 2 pl
they 3 pl
EPP <- data.frame(stringsAsFactors=TRUE,     # categorical variables become factors
   PRONOUN=c("I", "you", "he", "she", "it", "we", "you", "they"),
   PERSON=c(1, 2, 3, 3, 3, 1, 2, 3),
   NUMBER=rep(c("sg", "pl"), times=c(5, 3))) # 5 times "sg", then 3 times "pl"
EPP # show the new data frame
##   PRONOUN PERSON NUMBER
## 1       I      1     sg
## 2     you      2     sg
## 3      he      3     sg
## 4     she      3     sg
## 5      it      3     sg
## 6      we      1     pl
## 7     you      2     pl
## 8    they      3     pl

7 Exercise 07

Extract from this data frame EPP

  • the value of the fourth row and the second column;
  • the values of the third to fourth rows and the first to second columns;
  • the rows that have plural pronouns in them;
  • the rows with first and third person pronouns.
# 1st question
EPP[4, "PERSON"] # row 4 of the 2nd column (PERSON)
## [1] 3
# 2nd question
EPP[c(3, 4), c(1, 2)] # rows 3 to 4 of columns 1 to 2
##   PRONOUN PERSON
## 3      he      3
## 4     she      3
# 3rd question
is.plural <- EPP$NUMBER=="pl" # TRUE for the rows whose NUMBER is "pl"
EPP[is.plural, ] # or
##   PRONOUN PERSON NUMBER
## 6      we      1     pl
## 7     you      2     pl
## 8    they      3     pl
subset(EPP,      # of the data frame EPP, show only
   NUMBER=="pl") # the rows whose NUMBER is "pl"
##   PRONOUN PERSON NUMBER
## 6      we      1     pl
## 7     you      2     pl
## 8    they      3     pl
# 4th question
first.or.third <- EPP$PERSON==1 | EPP$PERSON==3 # TRUE when PERSON is 1 or when PERSON is 3
EPP[first.or.third, ]                           # of EPP, only those rows
##   PRONOUN PERSON NUMBER
## 1       I      1     sg
## 3      he      3     sg
## 4     she      3     sg
## 5      it      3     sg
## 6      we      1     pl
## 8    they      3     pl
EPP[!(EPP$PERSON==2), ] # of EPP, the rows when it is not the case that PERSON is 2
##   PRONOUN PERSON NUMBER
## 1       I      1     sg
## 3      he      3     sg
## 4     she      3     sg
## 5      it      3     sg
## 6      we      1     pl
## 8    they      3     pl
subset(EPP,              # of the data frame EPP, show only
   PERSON %in% c(1, 3))  # the rows whose PERSON is in the set 1, 3
##   PRONOUN PERSON NUMBER
## 1       I      1     sg
## 3      he      3     sg
## 4     she      3     sg
## 5      it      3     sg
## 6      we      1     pl
## 8    they      3     pl

8 Exercise 08

Generate a vector FREQS of the frequencies with which the personal pronouns in EPP occurred in a small corpus: I: 8426, you: 9462, he: 6394, she: 4234, it: 6040, we: 2305, you: 8078, they: 2998. Then, make this vector the fourth column of EPP.

# first generate the vector FREQS the exercise asks for (one frequency per row
# of EPP, in the row order I, you, he, she, it, we, you, they) ...
FREQS <- c(8426, 9462, 6394, 4234, 6040, 2305, 8078, 2998)
# ... then make it the fourth column of EPP (both ways do the same thing)
EPP$FREQS <- FREQS # or
EPP[,"FREQS"] <- FREQS

9 Exercise 09

Save the data frame EPP into <inputfiles/201_03_dataframe3.csv> such that you can easily load and edit it in spreadsheet software.

write.table(x=EPP,                          # write the data frame EPP
   file="inputfiles/201_03_dataframe3.csv", # into this file
   sep="\t", eol="\n",                      # w/ tabs between columns & line breaks after rows
   quote=FALSE, row.names=FALSE)            # w/out quotes (for factors) & w/out row names

10 Exercise 10

The file <inputfiles/201_03_dataframe4.csv> contains data for the VERB into VERBing construction in the BNC (e.g., He [V1 forced] him into [V2 speaking] about it). For each instance of one such construction, the file contains

  • the column BNC: the file where the instance was found (e.g., A06 in row 1);
  • the column VERB_LEMMA: the lemma of the finite verb (e.g., force in row 1);
  • the column ING_FORM: the gerund (e.g., speaking in row 1);
  • the column ING_TAG: the part-of-speech tag of the gerund (e.g., VVG in row 1);
  • the column ING_LEMMA: the lemma of the gerund (e.g., speak in row 1).

Load this file into a data frame x and check the loading was successful.

x <- read.delim(                       # import into the data frame x
   "inputfiles/201_03_dataframe4.csv", # the contents of this file
   stringsAsFactors=TRUE)              # and make categorical variables factors (!)
summary(x)                             # then summarize x to check the loading
##       BNC         VERB_LEMMA        ING_FORM       ING_TAG       ING_LEMMA   
##  HH3    :  11   force  : 101   thinking : 146   VVG    :1239   think  : 147  
##  K5D    :  11   trick  :  92   believing: 104   NN1-VVG: 158   believe: 104  
##  CBG    :  10   fool   :  77   making   :  62   AJ0-VVG: 108   make   :  62  
##  EUU    :  10   talk   :  62   giving   :  54   VDG    :  49   give   :  54  
##  HGM    :  10   mislead:  57   accepting:  51   VBG    :  23   accept :  51  
##  HXE    :  10   coerce :  52   doing    :  49   VHG    :  15   do     :  49  
##  (Other):1538   (Other):1159   (Other)  :1134   (Other):   8   (Other):1133
head(x, n=6) # look at the first 6 rows
##   BNC VERB_LEMMA ING_FORM ING_TAG ING_LEMMA
## 1 A06      force speaking     VVG     speak
## 2 A08      nudge    being     VBG        be
## 3 A0C       talk   taking     VVG      take
## 4 A0F      bully   taking     VVG      take
## 5 A0H  influence   trying     VVG       try
## 6 A0H     delude thinking     VVG     think

11 Exercise 11

  • What is the quickest way of identifying the numbers of verb lemma types and -ing lemma types? (don’t just use str!)
  • What is the most frequent verb lemma? (don’t just use summary!)
  • What is the most frequent -ing lemma with this verb lemma?
# 1st question
# str lists every column of x together with its number of factor levels; it is
# shown for comparison only, since the task asks for more than just str
str(x) # or
## 'data.frame':    1600 obs. of  5 variables:
##  $ BNC       : Factor w/ 929 levels "A06","A08","A0C",..: 1 2 3 4 5 5 6 6 7 8 ...
##  $ VERB_LEMMA: Factor w/ 208 levels "activate","aggravate",..: 76 126 186 26 96 51 75 149 152 186 ...
##  $ ING_FORM  : Factor w/ 422 levels "abandoning","abdicating",..: 354 49 382 382 395 387 387 133 209 175 ...
##  $ ING_TAG   : Factor w/ 10 levels "AJ0-NN1","AJ0-VVG",..: 10 7 10 10 10 10 10 1 10 10 ...
##  $ ING_LEMMA : Factor w/ 416 levels "abandon","abdicate",..: 349 41 377 377 389 382 382 377 207 173 ...
with(x,                        # within the data frame x:
   length(unique(VERB_LEMMA))) # how many different types/levels of VERB_LEMMA are there?
## [1] 208
with(x,                        # within the data frame x:
   length(unique(ING_LEMMA)))  # how many different types/levels of ING_LEMMA are there?
## [1] 416
# 2nd question
verb.freqs <- sort(table(x$VERB_LEMMA)) # the sorted (ascending) frequency table of VERB_LEMMA
names(tail(verb.freqs, 1))              # the name of its last, i.e. most frequent, item
## [1] "force"
# 3rd question
ing.w.force <- x$ING_LEMMA[x$VERB_LEMMA=="force"] # ING_LEMMA where VERB_LEMMA is "force"
ing.freqs <- sort(table(ing.w.force))             # the sorted (ascending) frequency table of those
names(tail(ing.freqs, 1))                         # the name of its last, i.e. most frequent, item
## [1] "make"

Here’s a great alternative for the 2nd question using the pipe (%>%) from the package magrittr:

library(magrittr)
x$VERB_LEMMA %>% # take the column VERB_LEMMA, then
   table() %>%   # make a frequency table of it, then
   sort() %>%    # sort that table (ascending), then
   tail(n=1) %>% # keep only its last item, then
   names()       # show that item's name
## [1] "force"

12 Exercise 12

  • Delete the column with the corpus files; the new data frame is to be called xx;
  • delete the rows with the four rarest tags; the new data frame is to be called xxx;
  • from xxx, create a new data frame xxxx which is sorted according to
    • the column VERB_LEMMA (ascending) and, within that,
    • the column ING_LEMMA (descending);
  • save the changed data frame into a tab-delimited text file <inputfiles/201_03_dataframe5.csv> such that you can easily load and edit it in spreadsheet software.
xx <- x[,2:5] # keep columns 2 to 5, i.e. drop column 1 (BNC, the corpus files)

# step 1: determine the rarest tags
sort(                    # sort
   table(xx$ING_TAG)) # the frequency table of ING_TAG (note the xx$)
## 
## AJ0-NN1     CJS     UNC     NN1     VHG     VBG     VDG AJ0-VVG NN1-VVG     VVG 
##       1       1       2       4      15      23      49     108     158    1239
# step 2: determine the vector of deletees
rare.tags <- c("AJ0-NN1", "CJS", "UNC", "NN1") # the four rarest tags from step 1
deletees <- which(            # the deletees are the positions where
   xx$ING_TAG %in% rare.tags) # xx$ING_TAG is one of the four rarest tags
# step 3: delete
# (a logical filter is used instead of xx[-deletees,] because a negative index
# built from an empty deletees vector would drop every row rather than none)
xxx <- xx[!(xx$ING_TAG %in% rare.tags),]

order.index <- with(xxx,     # within xxx, make order.index the order
   order(VERB_LEMMA,         # that sorts by VERB_LEMMA (ascending) & breaks ties by
      -rank(ING_LEMMA)))     # ING_LEMMA in reverse (descending)
xxxx <- xxx[order.index, ]   # xxxx is xxx w/ its rows in that order

write.table(x=xxxx,                         # write the data frame xxxx
   file="inputfiles/201_03_dataframe5.csv", # into this file
   sep="\t", eol="\n",                      # w/ tabs between columns & line breaks after rows
   quote=FALSE, row.names=FALSE)            # w/out quotes (for factors) & w/out row names
# A more advanced alternative for the second task:
# the names of the sorted (ascending) frequency table of ING_TAG start with the
# rarest tags, so its first 4 names are the tags whose rows are to be dropped
rarest.four <- names(sort(table(xx$ING_TAG)))[1:4]
xxx <- xx[!(xx$ING_TAG %in% rarest.four),]
# show the R version, platform, locale, & packages of the current session
sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Pop!_OS 22.04 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   compiler 
## [8] base     
## 
## other attached packages:
## [1] magrittr_2.0.3
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.31   R6_2.5.1        lifecycle_1.0.3 jsonlite_1.8.4 
##  [5] evaluate_0.19   stringi_1.7.12  cachem_1.0.6    rlang_1.0.6    
##  [9] cli_3.6.0       rstudioapi_0.14 jquerylib_0.1.4 bslib_0.4.2    
## [13] vctrs_0.5.1     rmarkdown_2.19  tools_4.2.2     stringr_1.5.0  
## [17] glue_1.6.2      xfun_0.36       yaml_2.3.6      fastmap_1.1.0  
## [21] htmltools_0.5.4 knitr_1.41      sass_0.4.4

13 Homework

To prepare for next week, read (and work through!) SFLWR3: Sections 3.1 and 3.5.3.