9 Week 5 Demo

9.1 Setup

First, we’ll load the packages we’ll be using in this week’s brief demo.

# austin is not on CRAN; install from GitHub if not already available
#devtools::install_github("conjugateprior/austin")
library(austin)             # wordscores and wordfish scaling implementations
library(quanteda)           # quantitative text analysis toolkit
library(quanteda.textstats) # text statistics companion package for quanteda

9.2 Wordscores

We can inspect the function for the wordscores model by Laver et al. (2003) in the following way:

# Typing the function name without parentheses prints its source code
classic.wordscores
## function (wfm, scores) 
## {
##     if (!is.wfm(wfm)) 
##         stop("Function not applicable to this object")
##     if (length(scores) != length(docs(wfm))) 
##         stop("There are not the same number of documents as scores")
##     if (any(is.na(scores))) 
##         stop("One of the reference document scores is NA\nFit the model with known scores and use 'predict' to get virgin score estimates")
##     thecall <- match.call()
##     C.all <- as.worddoc(wfm)
##     C <- C.all[rowSums(C.all) > 0, ]
##     F <- scale(C, center = FALSE, scale = colSums(C))
##     ws <- apply(F, 1, function(x) {
##         sum(scores * x)
##     })/rowSums(F)
##     pi <- matrix(ws, nrow = length(ws))
##     rownames(pi) <- rownames(C)
##     colnames(pi) <- c("Score")
##     val <- list(pi = pi, theta = scores, data = wfm, call = thecall)
##     class(val) <- c("classic.wordscores", "wordscores", class(val))
##     return(val)
## }
## <bytecode: 0x1265783c8>
## <environment: namespace:austin>

We can then take some example data included in the austin package.

# Load the example word-frequency matrix bundled with austin
# (the simulated data from Laver, Benoit and Garry 2003)
data(lbg)
# Keep only the five reference documents (columns 1-5, labelled R1-R5)
ref <- getdocs(lbg, 1:5)

And here our reference documents are all those documents marked with an “R” for reference; i.e., columns one to five.

ref
##      docs
## words  R1  R2  R3  R4  R5
##    A    2   0   0   0   0
##    B    3   0   0   0   0
##    C   10   0   0   0   0
##    D   22   0   0   0   0
##    E   45   0   0   0   0
##    F   78   2   0   0   0
##    G  115   3   0   0   0
##    H  146  10   0   0   0
##    I  158  22   0   0   0
##    J  146  45   0   0   0
##    K  115  78   2   0   0
##    L   78 115   3   0   0
##    M   45 146  10   0   0
##    N   22 158  22   0   0
##    O   10 146  45   0   0
##    P    3 115  78   2   0
##    Q    2  78 115   3   0
##    R    0  45 146  10   0
##    S    0  22 158  22   0
##    T    0  10 146  45   0
##    U    0   3 115  78   2
##    V    0   2  78 115   3
##    W    0   0  45 146  10
##    X    0   0  22 158  22
##    Y    0   0  10 146  45
##    Z    0   0   3 115  78
##    ZA   0   0   2  78 115
##    ZB   0   0   0  45 146
##    ZC   0   0   0  22 158
##    ZD   0   0   0  10 146
##    ZE   0   0   0   3 115
##    ZF   0   0   0   2  78
##    ZG   0   0   0   0  45
##    ZH   0   0   0   0  22
##    ZI   0   0   0   0  10
##    ZJ   0   0   0   0   3
##    ZK   0   0   0   0   2

This matrix is simply a series of words (here: letters) and reference texts with word counts for each of them.

We can then look at the wordscores for each of the words, calculated using the reference dimensions for each of the reference documents.

# Fit the wordscores model, assigning evenly spaced reference scores
# (-1.5, -0.75, 0, 0.75, 1.5) to the five reference documents
ws <- classic.wordscores(ref, scores=seq(-1.5,1.5,by=0.75))
# Inspect the fitted object: per-word scores ($pi) and the supplied
# reference-document scores ($theta)
ws
## $pi
##         Score
## A  -1.5000000
## B  -1.5000000
## C  -1.5000000
## D  -1.5000000
## E  -1.5000000
## F  -1.4812500
## G  -1.4809322
## H  -1.4519231
## I  -1.4083333
## J  -1.3232984
## K  -1.1846154
## L  -1.0369898
## M  -0.8805970
## N  -0.7500000
## O  -0.6194030
## P  -0.4507576
## Q  -0.2992424
## R  -0.1305970
## S   0.0000000
## T   0.1305970
## U   0.2992424
## V   0.4507576
## W   0.6194030
## X   0.7500000
## Y   0.8805970
## Z   1.0369898
## ZA  1.1846154
## ZB  1.3232984
## ZC  1.4083333
## ZD  1.4519231
## ZE  1.4809322
## ZF  1.4812500
## ZG  1.5000000
## ZH  1.5000000
## ZI  1.5000000
## ZJ  1.5000000
## ZK  1.5000000
## 
## $theta
## [1] -1.50 -0.75  0.00  0.75  1.50
## 
## $data
##      docs
## words  R1  R2  R3  R4  R5
##    A    2   0   0   0   0
##    B    3   0   0   0   0
##    C   10   0   0   0   0
##    D   22   0   0   0   0
##    E   45   0   0   0   0
##    F   78   2   0   0   0
##    G  115   3   0   0   0
##    H  146  10   0   0   0
##    I  158  22   0   0   0
##    J  146  45   0   0   0
##    K  115  78   2   0   0
##    L   78 115   3   0   0
##    M   45 146  10   0   0
##    N   22 158  22   0   0
##    O   10 146  45   0   0
##    P    3 115  78   2   0
##    Q    2  78 115   3   0
##    R    0  45 146  10   0
##    S    0  22 158  22   0
##    T    0  10 146  45   0
##    U    0   3 115  78   2
##    V    0   2  78 115   3
##    W    0   0  45 146  10
##    X    0   0  22 158  22
##    Y    0   0  10 146  45
##    Z    0   0   3 115  78
##    ZA   0   0   2  78 115
##    ZB   0   0   0  45 146
##    ZC   0   0   0  22 158
##    ZD   0   0   0  10 146
##    ZE   0   0   0   3 115
##    ZF   0   0   0   2  78
##    ZG   0   0   0   0  45
##    ZH   0   0   0   0  22
##    ZI   0   0   0   0  10
##    ZJ   0   0   0   0   3
##    ZK   0   0   0   0   2
## 
## $call
## classic.wordscores(wfm = ref, scores = seq(-1.5, 1.5, by = 0.75))
## 
## attr(,"class")
## [1] "classic.wordscores" "wordscores"         "list"

And here we see the thetas contained in this wordscores object, i.e., the reference dimensions for each of the reference documents and the pis, i.e., the estimated wordscores for each word.

We can now use these to score the so-called “virgin” texts as follows.

#get "virgin" documents
# i.e., the unscored document V1 whose position we want to estimate
vir <- getdocs(lbg, 'V1')
vir
##      docs
## words  V1
##    A    0
##    B    0
##    C    0
##    D    0
##    E    0
##    F    0
##    G    0
##    H    2
##    I    3
##    J   10
##    K   22
##    L   45
##    M   78
##    N  115
##    O  146
##    P  158
##    Q  146
##    R  115
##    S   78
##    T   45
##    U   22
##    V   10
##    W    3
##    X    2
##    Y    0
##    Z    0
##    ZA   0
##    ZB   0
##    ZC   0
##    ZD   0
##    ZE   0
##    ZF   0
##    ZG   0
##    ZH   0
##    ZI   0
##    ZJ   0
##    ZK   0
# predict textscores for the virgin documents
# (reports the estimated score, its standard error, the rescaled score,
# and lower/upper bounds, as shown in the output below)
predict(ws, newdata=vir)
## 37 of 37 words (100%) are scorable
## 
##     Score Std. Err. Rescaled  Lower  Upper
## V1 -0.448    0.0119   -0.448 -0.459 -0.437

9.3 Wordfish

If we wish, we can inspect the function for the wordfish model by Slapin and Proksch (2008) in the following way. This is a much more complex algorithm, which is not printed here, but you can inspect it on your own device.

# Typing the function name without parentheses prints its source
# (output omitted here; run this yourself to inspect the algorithm)
wordfish

We can then simulate some data, formatted appropriately for wordfish estimation, in the following way:

# Simulate a word-frequency matrix with known document positions ($theta),
# document lengths ($doclen), and word-level parameters ($psi, $beta)
dd <- sim.wordfish()

# Inspect the simulated data
dd
## $Y
##      docs
## words D01 D02 D03 D04 D05 D06 D07 D08 D09 D10
##   W01  14  16  24  18  17  14   8   8   5   5
##   W02  25  27  25  22  10   9  11   6   7   4
##   W03  26  20  21  18  18  10  10   5  10   4
##   W04  16  15  13  17  18   9   5  12   7   3
##   W05  21  18  20  13  17  15  11   3   6   4
##   W06   6   7  11  14   8  15  16  13  17  22
##   W07   3   9  10  10  10  14  20  23  23  20
##   W08   4   9   5   9  15  13  15  15  18  24
##   W09   5   4   6   7   9  18  16  33  19  28
##   W10   6   9   6   7  14  10  15  16  22  20
##   W11  58  54  34  46  50  41  31  25  16  11
##   W12  58  63  58  51  40  32  31  23  20  15
##   W13  57  59  46  58  33  30  23  25  17  12
##   W14  48  57  47  43  47  33  31  26  10  15
##   W15  69  55  57  46  31  37  28  23  27  17
##   W16  18  24  18  18  30  36  44  36  56  60
##   W17  17  12  21  20  38  36  55  41  58  60
##   W18  14  14  25  30  35  37  51  58  44  52
##   W19  16  16  30  33  32  41  48  56  61  55
##   W20  19  12  23  20  28  50  31  53  57  69
## 
## $theta
##  [1] -1.4863011 -1.1560120 -0.8257228 -0.4954337 -0.1651446  0.1651446
##  [7]  0.4954337  0.8257228  1.1560120  1.4863011
## 
## $doclen
## D01 D02 D03 D04 D05 D06 D07 D08 D09 D10 
## 500 500 500 500 500 500 500 500 500 500 
## 
## $psi
##  [1] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
## 
## $beta
##  [1] 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1
## 
## attr(,"class")
## [1] "wordfish.simdata" "list"

Here we can see the document and word-level fixed effects, as well as the specified range of the thetas to be estimated.

Then estimating the document positions is simply a matter of implementing this algorithm.

# Estimate wordfish document positions from the simulated count matrix
wf <- wordfish(dd$Y)
# Summarise estimated positions with standard errors and
# lower/upper bounds for each document
summary(wf)
## Call:
##  wordfish(wfm = dd$Y)
## 
## Document Positions:
##     Estimate Std. Error    Lower    Upper
## D01  -1.2876    0.10926 -1.50176 -1.07345
## D02  -1.2137    0.10717 -1.42379 -1.00368
## D03  -0.8169    0.09816 -1.00930 -0.62453
## D04  -0.6986    0.09617 -0.88709 -0.51013
## D05  -0.2206    0.09124 -0.39945 -0.04182
## D06   0.2022    0.09093  0.02394  0.38040
## D07   0.5068    0.09306  0.32438  0.68918
## D08   0.8551    0.09795  0.66312  1.04707
## D09   1.1158    0.10337  0.91324  1.31845
## D10   1.5565    0.11616  1.32880  1.78414

9.4 Using quanteda

We can also use quanteda to implement the same scaling techniques, as demonstrated in Exercise 4.