Generate a data frame abc that contains the letters from a to j in the first
column and the integers from 10 to 1 in the second column. Make sure the first
column is called LETTER and the second NUMBER.
# solution 1: build the two columns as separate vectors, then combine them
LETTER <- # create a data structure called LETTER with
letters[1:10] # the first 10 elements of the inbuilt vector letters
NUMBER <- # create a data structure called NUMBER with
10:1 # the integers from 10 to 1 in it
abc <- data.frame( # create a data frame abc
LETTER, # with LETTER as the 1st column
NUMBER, # with NUMBER as the 2nd column
stringsAsFactors=TRUE) # and make categorical variables factors (!)
# solution 2 (in one go, w/out creating LETTER & NUMBER separately first)
abc <- data.frame( # create a data frame abc
LETTER=letters[1:10], # with LETTER as the 1st column, w/ letters from a to j in there
NUMBER=10:1, # with NUMBER as the 2nd column, w/ numbers from 10 to 1 in there
stringsAsFactors=TRUE) # and make categorical variables factors (!)
Load the text file <inputfiles/201_03_dataframe1.csv> into a data frame example
and check the loading was successful.
# import the file into the data frame example ...
example <- read.delim(
   file="inputfiles/201_03_dataframe1.csv",
   stringsAsFactors=TRUE)   # ... w/ categorical variables as factors (!)
# check the import by summarizing every column of example
summary(example)
## CASE GRMRELATION LENGTH DEFINITENESS
## Min. : 1.00 obj :6 Min. : 2.000 def :6
## 1st Qu.: 3.75 subj:6 1st Qu.: 3.750 indef:6
## Median : 6.50 Median : 7.000
## Mean : 6.50 Mean : 6.417
## 3rd Qu.: 9.25 3rd Qu.: 9.000
## Max. :12.00 Max. :10.000
Extract from this data frame
# 1st question
example$GRMRELATION # column 2: individually
## [1] obj obj obj obj obj obj subj subj subj subj subj subj
## Levels: obj subj
example$LENGTH # column 3: individually
## [1] 2 2 10 6 7 4 3 9 9 9 7 9
example[,2:3] # columns 2 & 3 jointly (way 1)
## GRMRELATION LENGTH
## 1 obj 2
## 2 obj 2
## 3 obj 10
## 4 obj 6
## 5 obj 7
## 6 obj 4
## 7 subj 3
## 8 subj 9
## 9 subj 9
## 10 subj 9
## 11 subj 7
## 12 subj 9
example[,c("GRMRELATION", "LENGTH")] # columns 2 & 3 jointly (way 2)
## GRMRELATION LENGTH
## 1 obj 2
## 2 obj 2
## 3 obj 10
## 4 obj 6
## 5 obj 7
## 6 obj 4
## 7 subj 3
## 8 subj 9
## 9 subj 9
## 10 subj 9
## 11 subj 7
## 12 subj 9
# 2nd question
example[3:4,]
## CASE GRMRELATION LENGTH DEFINITENESS
## 3 3 obj 10 def
## 4 4 obj 6 indef
Split the data frame example up according to the content of the second column
(enter ?split at the R prompt for help).
# option 1
split(example, # split up the data frame example
example$GRMRELATION) # depending on the values of the column GRMRELATION
## $obj
## CASE GRMRELATION LENGTH DEFINITENESS
## 1 1 obj 2 def
## 2 2 obj 2 def
## 3 3 obj 10 def
## 4 4 obj 6 indef
## 5 5 obj 7 indef
## 6 6 obj 4 indef
##
## $subj
## CASE GRMRELATION LENGTH DEFINITENESS
## 7 7 subj 3 def
## 8 8 subj 9 def
## 9 9 subj 9 def
## 10 10 subj 9 indef
## 11 11 subj 7 indef
## 12 12 subj 9 indef
# option 2
split(example, # split up the data frame example
example[,"GRMRELATION"]) # depending on the values of the column GRMRELATION
## $obj
## CASE GRMRELATION LENGTH DEFINITENESS
## 1 1 obj 2 def
## 2 2 obj 2 def
## 3 3 obj 10 def
## 4 4 obj 6 indef
## 5 5 obj 7 indef
## 6 6 obj 4 indef
##
## $subj
## CASE GRMRELATION LENGTH DEFINITENESS
## 7 7 subj 3 def
## 8 8 subj 9 def
## 9 9 subj 9 def
## 10 10 subj 9 indef
## 11 11 subj 7 indef
## 12 12 subj 9 indef
# option 3
split(example, # split up the data frame example
example[,2]) # depending on the values of the 2nd column
## $obj
## CASE GRMRELATION LENGTH DEFINITENESS
## 1 1 obj 2 def
## 2 2 obj 2 def
## 3 3 obj 10 def
## 4 4 obj 6 indef
## 5 5 obj 7 indef
## 6 6 obj 4 indef
##
## $subj
## CASE GRMRELATION LENGTH DEFINITENESS
## 7 7 subj 3 def
## 8 8 subj 9 def
## 9 9 subj 9 def
## 10 10 subj 9 indef
## 11 11 subj 7 indef
## 12 12 subj 9 indef
Here are separate manual alternatives:
subset(example, # show a subset of the data frame example, namely
example$GRMRELATION=="obj") # when GRMRELATION is "obj"
## CASE GRMRELATION LENGTH DEFINITENESS
## 1 1 obj 2 def
## 2 2 obj 2 def
## 3 3 obj 10 def
## 4 4 obj 6 indef
## 5 5 obj 7 indef
## 6 6 obj 4 indef
subset(example, # show a subset of the data frame example, namely
example$GRMRELATION=="subj") # when GRMRELATION is "subj"
## CASE GRMRELATION LENGTH DEFINITENESS
## 7 7 subj 3 def
## 8 8 subj 9 def
## 9 9 subj 9 def
## 10 10 subj 9 indef
## 11 11 subj 7 indef
## 12 12 subj 9 indef
Change the value at the intersection of the third row and the fourth column into “indef” and save the changed data frame into <inputfiles/201_03_dataframe2.csv> such that you can easily load/edit in a spreadsheet software.
# overwrite the cell in row 3, column 4 of example
example[3, 4] <- "indef"
# save the changed data frame
write.table(
   x=example,                                 # the data frame to write
   file="inputfiles/201_03_dataframe2.csv",   # the file to write it into
   quote=FALSE, row.names=FALSE,              # no quotes (for factors) & no row names
   sep="\t", eol="\n")                        # tabs between columns & line breaks
Generate the following data frame and call it EPP (for English personal
pronouns) […]
PRONOUN | PERSON | NUMBER |
---|---|---|
I | 1 | sg |
you | 2 | sg |
he | 3 | sg |
she | 3 | sg |
it | 3 | sg |
we | 1 | pl |
you | 2 | pl |
they | 3 | pl |
# create the data frame EPP, one row per pronoun
EPP <- data.frame(
   PRONOUN=c("I", "you", "he", "she", "it", "we", "you", "they"),
   PERSON=c(1, 2, 3, 3, 3, 1, 2, 3),
   NUMBER=c(rep("sg", times=5), rep("pl", times=3)),   # 5 singulars, then 3 plurals
   stringsAsFactors=TRUE)   # make categorical variables factors (!)
EPP   # display the result
## PRONOUN PERSON NUMBER
## 1 I 1 sg
## 2 you 2 sg
## 3 he 3 sg
## 4 she 3 sg
## 5 it 3 sg
## 6 we 1 pl
## 7 you 2 pl
## 8 they 3 pl
Extract from this data frame EPP
# 1st question
EPP[4,2]
## [1] 3
# 2nd question
EPP[3:4,1:2]
## PRONOUN PERSON
## 3 he 3
## 4 she 3
# 3rd question
EPP[EPP$NUMBER=="pl",] # or
## PRONOUN PERSON NUMBER
## 6 we 1 pl
## 7 you 2 pl
## 8 they 3 pl
subset( # show a subset
EPP, # of the data frame EPP, namely
EPP$NUMBER=="pl") # when NUMBER is "pl"
## PRONOUN PERSON NUMBER
## 6 we 1 pl
## 7 you 2 pl
## 8 they 3 pl
# 4th question
EPP[(EPP$PERSON==1 # of EPP, the rows when PERSON is 1
| # or
EPP$PERSON==3),] # when PERSON is 3
## PRONOUN PERSON NUMBER
## 1 I 1 sg
## 3 he 3 sg
## 4 she 3 sg
## 5 it 3 sg
## 6 we 1 pl
## 8 they 3 pl
EPP[EPP$PERSON!=2,] # of EPP, the rows when PERSON is not 2
## PRONOUN PERSON NUMBER
## 1 I 1 sg
## 3 he 3 sg
## 4 she 3 sg
## 5 it 3 sg
## 6 we 1 pl
## 8 they 3 pl
subset( # show a subset
EPP, # of the data frame EPP, namely
EPP$PERSON %in% c(1, 3)) # when PERSON is in the set 1, 3
## PRONOUN PERSON NUMBER
## 1 I 1 sg
## 3 he 3 sg
## 4 she 3 sg
## 5 it 3 sg
## 6 we 1 pl
## 8 they 3 pl
Generate a vector FREQS of the frequencies with which the personal pronouns in
EPP occurred in a small corpus: I: 8426, you: 9462, he: 6394, she: 4234,
it: 6040, we: 2305, you: 8078, they: 2998. Then, make this vector the fourth
column of EPP.
EPP$FREQS <- c(8426, 9462, 6394, 4234, 6040, 2305, 8078, 2998) # or
EPP[,"FREQS"] <- c(8426, 9462, 6394, 4234, 6040, 2305, 8078, 2998)
Save the data frame EPP into <inputfiles/201_03_dataframe3.csv> such that you
can easily load/edit in a spreadsheet software.
# save the data frame EPP
write.table(
   x=EPP,                                     # the data frame to write
   file="inputfiles/201_03_dataframe3.csv",   # the file to write it into
   quote=FALSE, row.names=FALSE,              # no quotes (for factors) & no row names
   sep="\t", eol="\n")                        # tabs between columns & line breaks
The file <inputfiles/201_03_dataframe4.csv> contains data for the VERB into VERBing construction in the BNC (e.g., He [V1 forced] him into [V2 speaking] about it). For each instance of one such construction, the file contains
- BNC: the file where the instance was found (e.g., A06 in row 1);
- VERB_LEMMA: the lemma of the finite verb (e.g., force in row 1);
- ING_FORM: the gerund (e.g., speaking in row 1);
- ING_TAG: the part-of-speech tag of the gerund (e.g., VVG in row 1);
- ING_LEMMA: the lemma of the gerund (e.g., speak in row 1).

Load this file into a data frame x and check the loading was successful.
# import the file into the data frame x ...
x <- read.delim(
   file="inputfiles/201_03_dataframe4.csv",
   stringsAsFactors=TRUE)   # ... and make categorical variables factors (!)
# check the import by summarizing every column of x
summary(x)
## BNC VERB_LEMMA ING_FORM ING_TAG ING_LEMMA
## HH3 : 11 force : 101 thinking : 146 VVG :1239 think : 147
## K5D : 11 trick : 92 believing: 104 NN1-VVG: 158 believe: 104
## CBG : 10 fool : 77 making : 62 AJ0-VVG: 108 make : 62
## EUU : 10 talk : 62 giving : 54 VDG : 49 give : 54
## HGM : 10 mislead: 57 accepting: 51 VBG : 23 accept : 51
## HXE : 10 coerce : 52 doing : 49 VHG : 15 do : 49
## (Other):1538 (Other):1159 (Other) :1134 (Other): 8 (Other):1133
head(x) # look at the first 6 rows
## BNC VERB_LEMMA ING_FORM ING_TAG ING_LEMMA
## 1 A06 force speaking VVG speak
## 2 A08 nudge being VBG be
## 3 A0C talk taking VVG take
## 4 A0F bully taking VVG take
## 5 A0H influence trying VVG try
## 6 A0H delude thinking VVG think
str
!)summary
!)
# 1st question
str(x) # or
## 'data.frame': 1600 obs. of 5 variables:
## $ BNC : Factor w/ 929 levels "A06","A08","A0C",..: 1 2 3 4 5 5 6 6 7 8 ...
## $ VERB_LEMMA: Factor w/ 208 levels "activate","aggravate",..: 76 126 186 26 96 51 75 149 152 186 ...
## $ ING_FORM : Factor w/ 422 levels "abandoning","abdicating",..: 354 49 382 382 395 387 387 133 209 175 ...
## $ ING_TAG : Factor w/ 10 levels "AJ0-NN1","AJ0-VVG",..: 10 7 10 10 10 10 10 1 10 10 ...
## $ ING_LEMMA : Factor w/ 416 levels "abandon","abdicate",..: 349 41 377 377 389 382 382 377 207 173 ...
length( # how many
unique(x$VERB_LEMMA)) # different types/levels of VERB_LEMMA are there?
## [1] 208
length( # how many
unique(x$ING_LEMMA)) # unique types/levels of ING_LEMMA are there?
## [1] 416
# 2nd question
names(tail(sort( # show the name of the tail of the sorted
table(x$VERB_LEMMA)), # frequency table of VERB_LEMMA
1)) # namely the last item
## [1] "force"
# 3rd question
names(tail(sort( # the name of the end of the sorted
table(x$ING_LEMMA[ # frequency table of ING_LEMMA where
x$VERB_LEMMA=="force"]) # VERB_LEMMA is "force"
), 1)) # end of sort(), then end of tail(), then end of names()
## [1] "make"
Here’s a great alternative for the 2nd question using the pipe (%>%) from the
package magrittr:
library(magrittr)
x$VERB_LEMMA %>% table %>% sort %>% tail(1) %>% names
## [1] "force"
xx
;xxx
;xxx
, create a new data frame xxxx
which is sorted according to
VERB_LEMMA
(ascending) and, within
that, ING_LEMMA
(descending);
xx <- x[,2:5]
# step 1: determine the rarest tags
sort( # sort (in ascending order, so the rarest tags come first)
table(xx$ING_TAG)) # the frequency table of ING_TAG (note the xx$)
##
## AJ0-NN1 CJS UNC NN1 VHG VBG VDG AJ0-VVG NN1-VVG VVG
## 1 1 2 4 15 23 49 108 158 1239
# step 2: determine the vector of deletees
rare.tags <- c("AJ0-NN1", "CJS", "UNC", "NN1")   # the 4 rarest tags from step 1
is.deletee <- xx$ING_TAG %in% rare.tags          # TRUE for each row of xx w/ a rare tag
deletees <- which(is.deletee)                    # the row numbers of the deletees
# step 3: delete
# (the logical vector is used instead of xx[-deletees,] because, if no row had
# a rare tag, deletees would be integer(0) & xx[-integer(0),] returns 0 rows
# rather than all of them)
xxx <- xx[!is.deletee,]
order.index <- order( # make order.index the order
xxx$VERB_LEMMA, # to sort by xxx$VERB_LEMMA, then break ties by
-rank(xxx$ING_LEMMA)) # (reversed) xxx$ING_LEMMA
xxxx <- xxx[order.index,] # make xxxx the re-sorted data frame
write.table(xxxx, # write the data frame xxxx
sep="\t", eol="\n", # with tabs between columns & line breaks
row.names=FALSE, quote=FALSE, # no row names & quotes (for factors)
file="inputfiles/201_03_dataframe5.csv") # into this file
# A more advanced alternative for the second task: sort the frequency table of
# ING_TAG in ascending order, take the names of its first 4 (= rarest) tags, &
# keep only the rows of xx whose ING_TAG is not one of them.
# (The column must be spelled ING_TAG: $ returns NULL for a column name that
# does not exist, which would silently make xxx a data frame w/ 0 rows.)
xxx <- xx[!(xx$ING_TAG %in%                 # the rows of xx whose tag is not in
   names(sort(table(xx$ING_TAG)))[1:4]),]   # the names of the 4 rarest tags
sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Pop!_OS 22.04 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods compiler
## [8] base
##
## other attached packages:
## [1] magrittr_2.0.3
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.31 R6_2.5.1 lifecycle_1.0.3 jsonlite_1.8.4
## [5] evaluate_0.19 stringi_1.7.12 cachem_1.0.6 rlang_1.0.6
## [9] cli_3.6.0 rstudioapi_0.14 jquerylib_0.1.4 bslib_0.4.2
## [13] vctrs_0.5.1 rmarkdown_2.19 tools_4.2.2 stringr_1.5.0
## [17] glue_1.6.2 xfun_0.36 yaml_2.3.6 fastmap_1.1.0
## [21] htmltools_0.5.4 knitr_1.41 sass_0.4.4