-
Notifications
You must be signed in to change notification settings - Fork 0
/
README.Rmd
892 lines (709 loc) · 53.5 KB
/
README.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
---
title: "Agent-Based Model of Cattle Brands"
author: Mason Youngblood
output: github_document
---
The associated preregistration document can be found on [OSF](https://osf.io/d4qv3/), and a detailed description of the structure of the cattle brand data can be found on the [corresponding GitHub](https://github.com/masonyoungblood/cattle_brand_data).
```{r echo=FALSE, fig.align="center", out.width="50%"}
#display a decorative header image (calves), centered at half width
img_url <- "https://images.unsplash.com/photo-1590249351139-c6effec78d2a?ixid=MnwxMjA3fDB8MHxzZWFyY2h8MjV8fGJhYnklMjBjb3dzfGVufDB8fDB8fA%3D%3D&ixlib=rb-1.2.1&w=1000&q=80"
knitr::include_graphics(img_url)
```
## Load and prepare brands data
First let's read in and clean up our brand data.
```{r}
#read in brand data, dropping the first (row index) column
brands <- read.csv("brand_data.csv")[, -1]
#remove brands with length of 12 (incorrectly specified according to manual inspection)
#logical subsetting is used instead of -which() so that nothing is dropped when no such brands exist
#(brands[-integer(0), ] would otherwise remove every row)
brands <- brands[nchar(brands$brand) != 12, ]
#substring brand codes into a vector of four components (3 digits each) and one location digit
brands$brand <- lapply(seq_len(nrow(brands)), function(x){substring(brands$brand[[x]], first = c(1, 4, 7, 10, 13), last = c(3, 6, 9, 12, 13))})
#print a sample
brands[1:10, ]
```
Each three digit code corresponds to a component in the brand, and the final single digit corresponds to the location of the brand on the animal. Note that the extra commas at the end of each brand component are just part of the data frame output. Some brands have a single digit in the 12th digit before the location digit to separate duplicates, where the same components are arranged in different ways. Let's extract those and put them into the sixth position of the brand vectors.
```{r eval=FALSE}
#move duplicate code (12th position) into the sixth position of the brand vectors
#(duplicates are brands with the same components arranged in different ways)
for(x in seq_len(nrow(brands))){
  #if the fourth position starts with a comma and ends with a number (or O, a typo for 0)
  #&& is used (rather than &) since both operands are scalar
  if(substring(brands$brand[[x]][4], 1, 1) == "," && substring(brands$brand[[x]][4], 3) %in% c("O", 0:9)){
    #put the duplicate number into the sixth position
    brands$brand[[x]][6] <- gsub(",", "", brands$brand[[x]][4])
    #replace any O with 0
    brands$brand[[x]][6] <- gsub("O", 0, brands$brand[[x]][6])
    #replace the original fourth position with ",,," (an empty component)
    brands$brand[[x]][4] <- ",,,"
  }
}
#save
save(brands, file = "raw_brands.RData")
```
```{r echo=FALSE}
#reload the cleaned brands so the eval=FALSE chunk above does not need to be re-run on every knit
load("raw_brands.RData")
```
Before we move any further we should determine whether letters in components tend to be initials or whether they should be treated as distinct. Here are the frequencies of letters in the brands.
```{r echo=FALSE, fig.align="center"}
#construct empty table for frequencies of letters in brands
#(length(LETTERS) is used for consistency — the original used lowercase `letters`, which
#happens to have the same length but reads as a different object)
brand_freq <- data.frame(letter = LETTERS, count = rep(0, length(LETTERS)))
#for each brand
for(i in seq_len(nrow(brands))){
  #get components that match letters (only first two characters since angles haven't been extracted yet)
  matches <- which(substr(brands$brand[[i]], 1, 2) %in% paste0(LETTERS, ","))
  #if there are matches
  if(length(matches) > 0){
    #store a frequency table of them temporarily
    temp <- table(substr(brands$brand[[i]], 1, 2)[matches])
    #add those frequencies to the existing counts
    brand_freq$count[match(names(temp), paste0(LETTERS, ","))] <- brand_freq$count[match(names(temp), paste0(LETTERS, ","))] + as.numeric(temp)
    rm(temp)
  }
  rm(matches)
}
#plot brand data
ggplot2::ggplot(data = brand_freq, ggplot2::aes(x = letter, y = count)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::theme_linedraw() +
  ggplot2::labs(x = "Letter", y = "Frequency", title = "Brands")
rm(brand_freq)
```
If we compare this with the distribution of first letters of surnames that appeared more than 100 times in the 2010 US Census we can see that the distributions are drastically different.
```{r echo=FALSE, fig.align="center"}
#load in all surnames that appeared more than 100 times in the 2010 US Census
census_names <- read.csv("Names_2010Census.csv")
#replace "(S)" in the census table with 0 ("(S)" marks suppressed percentage cells in the census file)
breakdown_matrix <- as.matrix(census_names[, 6:11])
breakdown_matrix[which(breakdown_matrix == "(S)")] <- 0
census_names[, 6:11] <- breakdown_matrix
#replace census table with adjusted counts by race (count * percentage column for each group)
census_names <- data.frame(letter = substr(census_names$name, 1, 1),
num_white = census_names$count*as.numeric(census_names$pctwhite),
num_black = census_names$count*as.numeric(census_names$pctblack),
num_api = census_names$count*as.numeric(census_names$pctapi),
num_na = census_names$count*as.numeric(census_names$pctaian),
num_mult = census_names$count*as.numeric(census_names$pct2prace),
num_hisp = census_names$count*as.numeric(census_names$pcthispanic))
#aggregate by letter
census_names <- aggregate(. ~ letter, data = census_names, sum)
#load in target racial breakdown for KS, which does not add up to 100% because of overlap (white, black, asian + pacific islander, native american, 2+, hispanic): https://www.census.gov/quickfacts/KS
ks_breakdown <- 2913314*c(0.863, 0.061, 0.033, 0.012, 0.031, 0.122)
#construct data frame for plotting
letter_data <- data.frame(letter = LETTERS,
name_freq = sapply(1:nrow(census_names), function(x){sum(census_names[x, 2:7])}),
#NOTE(review): ks_breakdown/census_names[x, 2:7] divides the KS target counts by this
#letter's per-group counts — confirm this is the intended reweighting (a product or
#proportional weighting may have been meant); the adjusted plot below is commented out
adj_name_freq = sapply(1:nrow(census_names), function(x){sum(ks_breakdown/census_names[x, 2:7])}))
#plot name data
ggplot2::ggplot(data = letter_data, ggplot2::aes(x = letter, y = name_freq)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::theme_linedraw() +
ggplot2::labs(x = "Letter", y = "Frequency", title = "Surnames")
#plot adjusted name data
#ggplot2::ggplot(data = letter_data, ggplot2::aes(x = letter, y = adj_name_freq)) +
# ggplot2::geom_bar(stat = "identity") +
# ggplot2::theme_linedraw() +
# ggplot2::labs(x = "Letter", y = "Adjusted Frequency", title = "Surnames")
rm(list = c("census_names", "breakdown_matrix", "ks_breakdown", "letter_data"))
```
```{r eval=FALSE, echo=FALSE}
#subset to the 2016 brand book once (the original recomputed this subset three times)
brands_2016 <- brands[which(brands$year == 2016), ]
#get booleans for brands that contain letters
has_letter <- sapply(seq_len(nrow(brands_2016)), function(x){
  length(which(substr(brands_2016$brand[[x]], 1, 2) %in% paste0(LETTERS, ","))) > 0
})
#randomly sample 200 of these to check manually (the original comment said 500, but 200 are drawn)
letters_check <- brands_2016[sample(which(has_letter), 200, replace = FALSE), ]
#collapse brand codes for checking, some will be different from the book since we've already moved the duplicate indicator to the final position
letters_check$brand <- sapply(seq_len(nrow(letters_check)), function(x){paste0(letters_check$brand[[x]], collapse = "")})
#save
write.csv(letters_check, file = "letters_check.csv")
#remove temporary object
rm(brands_2016)
```
```{r}
#load in the manually checked brands with letters and compute the percentage
#of brands where the letters were confirmed to be initials
prop_correct <- 100 * mean(read.csv("letters_checked.csv")$initials)
```
Weighting the US surname distribution so that it matches the ethnic composition of Kansas does not solve the issue, so we manually checked 200 random brands with letters in the 2016 brand book. In `r prop_correct`% of the random brands at least one of the letters corresponded to one or both of the first or last names of the owners or the ranch (with some letters modifying others, like A and L both appearing in a brand where the L is a "leg" on the right side of the A in a brand owned by the family Asmussen, and some characters combined to make a letter, like L and 7 combined into an S for Schneider).
Since a significant number of letters in brands are not initials, some letters (e.g. I and O) are also used to code shapes, and different letters have different possible rotation angles, we chose to keep them as separate categories. Numbers were also included as separate categories. Some of the symbols in the brand book have variants that are distinguished using the third digit that is usually used for rotation (e.g. AR, BB, MI, TK, and TR). We collapsed each set of these symbols into single categories that can be rotated just like any other component.
We made some further adjustments to Kansas' coding scheme by removing redundant components and restricting the possible angles of rotations for different symbols. All of these adjustments are stored in `components.xlsx`: the `collapse` sheet has which components should be converted to what along with their new angles, and the `rotation` sheet has which components can be rotated in which directions. For clarification, the possible rotation angles for each component are the first unique rotations for that shape. For example, the 1, 3, and 4 rotations for the letter A are all identical (upside down) so 1 is kept as the first unique rotation. The `rotation` sheet also has a column for each angle (1 to 9), where the value for each component is whether that angle is correct if it appears (NA), needs to be corrected to no rotation (0), or needs to be corrected to a different rotation (1 to 9).
All letter variations were converted to single categories. For example, A1 and A2, two stylistic variations of A with square and rounded tops, were both converted to A. The letter I is used primarily for vertical lines, so the other single line symbols (i.e. /, \\, and -) were converted to I (with their associated angles). 1, which could be easily mistaken for I, does not appear in the coding scheme. Double and triple lines are stored as separate symbols (i.e. = and -3). We chose to keep these as separate categories rather than convert them to multiple repetitions of I, as reducing components to their constituent parts introduces many other issues and could inflate the measured complexity (or number of components) of some brands.
), QC, and QC1 were all converted to ( (with their associated angles). ], BX1, and BX2 were all converted to [ (with their associated angles). DI (diamond) was converted to BX (box) with the associated angle. All of the ~ components (i.e. ~1 through ~6), which correspond to pairs of "squiggles" or "wings" that modify other components, do not seem to have systematic differences or angles and were combined into a single category without rotation.
MU, an unused code for music notes, was converted to NT, a code for notation that appears in the brand books. ^ was converted to A3, a code used for the same symbol in the brand books.
Several other simplifications were considered (with angles), such as converting + to X, 9 to 6, W to M, etc. For now we chose to keep these components separate.
```{r}
#load in adjustments from components.xlsx
#a relative path is used because knitr evaluates chunks with the working directory set to
#the document's location; rstudioapi::getSourceEditorContext() only works interactively
#inside RStudio and would break non-interactive knitting
collapse <- data.table::as.data.table(readxl::read_excel("components.xlsx", sheet = "collapse"))
rotation <- data.table::as.data.table(readxl::read_excel("components.xlsx", sheet = "rotation"))
#reformat rotation: split comma-separated angle strings (e.g. "1, 3") into character vectors
rotation$rot <- strsplit(rotation$rot, ", ")
```
```{r eval=FALSE}
#apply the `collapse` sheet: rewrite each flagged component to its target code, padded to
#three characters with trailing commas, appending the new rotation digit when one is
#specified ("NA" in the sheet means the replacement is unrotated)
#for each brand
for(i in 1:nrow(brands)){
#check if there are characters that need replacing (either three digit characters like BX1 and QC1 or substringed characters without commas)
temp <- which(brands$brand[[i]][1:4] %in% collapse$from | gsub(",", "", substr(brands$brand[[i]][1:4], 1, 2)) %in% collapse$from)
#and if there are
if(length(temp) > 0){
#go through them
for(j in 1:length(temp)){
#if the full three-digit component is in the data table (so something like BX1, QC1, etc.)
if(brands$brand[[i]][temp[j]] %in% collapse$from){
#store the index of the component
temp_2 <- which(collapse$from == brands$brand[[i]][temp[j]])
#and either replace it without or with rotation
if(collapse$rot[temp_2] == "NA"){
brands$brand[[i]][temp[j]] <- stringr::str_pad(collapse$to[temp_2], width = 3, side = "right", pad = ",")
} else{
#pad the target to two characters, then append the rotation digit as the third character
brands$brand[[i]][temp[j]] <- paste0(stringr::str_pad(collapse$to[temp_2], width = 2, side = "right", pad = ","), collapse$rot[temp_2])
}
} else{ #if it's less than three digits
#store the index of the component (matching on the comma-stripped two-character code)
temp_2 <- which(collapse$from == gsub(",", "", substr(brands$brand[[i]][temp[j]], 1, 2)))
#and either replace it without or with rotation
if(collapse$rot[temp_2] == "NA"){
brands$brand[[i]][temp[j]] <- stringr::str_pad(collapse$to[temp_2], width = 3, side = "right", pad = ",")
} else{
brands$brand[[i]][temp[j]] <- paste0(stringr::str_pad(collapse$to[temp_2], width = 2, side = "right", pad = ","), collapse$rot[temp_2])
}
}
rm(temp_2)
}
}
rm(temp)
}
```
To determine which angles are possible for each symbol we went through and assigned each one a five-digit binary code, where each digit corresponds to whether (0 or 1) each symbol matches itself when flipped horizontally, flipped vertically, rotated 90 degrees, rotated 180 degrees, and rotated 45 degrees. Then, for each five-digit code, we determined which rotation angles were possible. Some extra rotation angles were manually removed for character pairs like + and X, 9 and 6, and W and M, and some rotation angles had to be corrected when the binary code didn't quite capture the symbol, such as for %.
Now let's run through and correct any rotated components where the angle does not match the vector of first unique rotation angles for that component.
```{r eval=FALSE}
#correct rotated components whose angle is not one of the first unique rotation angles
#for that component, using the per-angle correction columns of the `rotation` sheet
#(NA = angle is fine, 0 = strip the rotation, 1-9 = replace with that angle)
#for each brand
for(i in 1:nrow(brands)){
#check if there are rotated components (third character is a digit 1-9)
temp <- which(substr(brands$brand[[i]][1:4], 3, 3) %in% c(1:9))
#if there are
if(length(temp) > 0){
#go through them
for(j in 1:length(temp)){
#store the index of the component in the rotation table
temp_2 <- which(rotation$component == gsub(",", "", substr(brands$brand[[i]][temp[j]], 1, 2)))
#store the current angle of rotation
angle <- substr(brands$brand[[i]][temp[j]], 3, 3)
#if the rotated component matches a real component (checking since we haven't removed misspecified components yet)
if(length(temp_2) > 0){
#and the rotation angle is not in the first unique rotation angles for that component
if(!(angle %in% rotation$rot[[temp_2]])){
#and the new rotation is not zero
#(data.table's ..angle selects the column whose name is stored in the `angle` variable)
if(as.numeric(rotation[temp_2, ..angle]) > 0){
#replace the current rotated component with a corrected version
brands$brand[[i]][temp[j]] <- paste0(substr(brands$brand[[i]][temp[j]], 1, 2), as.numeric(rotation[temp_2, ..angle]))
} else{ #if the new rotation is zero
#remove the rotation entirely
brands$brand[[i]][temp[j]] <- paste0(substr(brands$brand[[i]][temp[j]], 1, 2), ",")
}
}
}
rm(list = c("temp_2", "angle"))
}
}
rm(temp)
}
```
Now we need to store the brand components. These include the components listed in the brand book indices, as well as some additions that appear in the real data (i.e. "TX,"). The "UN," component was excluded as it corresponds to unidentifiable symbols.
```{r}
#store components, right-padded with commas to three characters so they match the
#three-digit codes used in the brand vectors (e.g. "A" becomes "A,,")
components <- stringr::str_pad(rotation$component, width = 3, side = "right", pad = ",")
#print them
components
```
We also need to generate all possible components, accounting for rotation, alongside index components in a data frame so rotated versions can be easily converted to non-rotated versions later on. Since we have already corrected incorrect angles we will only allow for angles that appear in the vector of first unique rotation angles for that component.
```{r eval=FALSE}
#build, for every component, its index code plus all legal rotated variants
#(assembled with lapply + do.call(rbind, ...) rather than growing vectors with c() in a
#loop, which copies the vectors on every iteration)
poss_list <- lapply(seq_along(components), function(x){
  #if the component is not rotatable, it only appears in its unrotated form
  if(length(which(rotation$rot[[x]] == "NA")) > 0){
    data.frame(index = components[x], rot = components[x])
  } else{
    #otherwise include the unrotated form plus one row per allowed rotation angle,
    #mapping every variant back to the same index component
    rotated <- paste0(substr(components[x], 1, 2), rotation$rot[[x]])
    data.frame(index = components[x], rot = c(components[x], rotated))
  }
})
#combine into a single data frame so rotated versions can be converted to non-rotated
#versions later on
all_poss_components <- do.call(rbind, poss_list)
rm(poss_list)
```
Now let's remove any misspecified brands with components that don't appear in `all_poss_components`.
```{r eval=FALSE}
#keep only well-specified brands: all four component slots must be either a known
#(possibly rotated) component or the empty code ",,,"
#logical subsetting is used instead of -which() so that nothing is dropped when no brands
#are misspecified (brands[-integer(0), ] would otherwise remove every row)
well_specified <- sapply(seq_len(nrow(brands)), function(x){
  length(which(brands$brand[[x]][1:4] %in% c(all_poss_components$rot, ",,,"))) == 4
})
brands <- brands[well_specified, ]
rm(well_specified)
```
Before we move any further, we should go ahead and retrieve our geographic data and subset our brands to only include those registered in the state of Kansas.
```{r eval=FALSE}
#get all kansas zip codes
data("zip_code_db", package = "zipcodeR")
zip_code_db <- zip_code_db[which(zip_code_db$state == "KS"), ]
#identify zip codes with missing location data
missing_locations <- which(is.na(zip_code_db$lat))
#save zip codes with missing locations, to manually fill as CSV file outside of R using Google Maps
#sink("location_data/missing_zips.txt")
#cat(zip_code_db$zipcode[missing_locations], sep = "\n")
#sink()
#add in found locations
#NOTE(review): this assumes the rows of missing_zips_found.txt are in exactly the same
#order as `missing_locations` (the order written out by the sink() above) — verify if the
#file is ever regenerated
found <- read.csv("location_data/missing_zips_found.txt", header = FALSE)
zip_code_db$lat[missing_locations] <- found$V2
zip_code_db$lng[missing_locations] <- found$V3
#construct all_zips, a data table with zip codes, counties, latitudes, and longitudes
all_zips <- data.table::data.table(zip = as.numeric(zip_code_db$zipcode), county = as.factor(zip_code_db$county), lat = zip_code_db$lat, lon = zip_code_db$lng)
#save
save(all_zips, file = "location_data/all_zips.RData")
#subset brands to only include those from kansas
brands <- brands[which(brands$location %in% all_zips$zip), ]
#remove temporary objects
rm(list = c("zip_code_db", "missing_locations", "found"))
```
For our agent-based model (ABM) we will also need the pairwise geodesic distances between all of the zip codes in Kansas, so let's go ahead and collect that (and save the object after, because it takes a while).
```{r eval=FALSE}
#calculate pairwise geodesic distances (in kilometers) between zip codes
zip_dists <- geodist::geodist(all_zips[, 3:4], measure = "geodesic")/1000
#label both dimensions of the distance matrix with the corresponding zip codes
dimnames(zip_dists) <- list(all_zips$zip, all_zips$zip)
#save
save(zip_dists, file = "location_data/zip_dists.RData")
```
```{r echo=FALSE}
#reload the precomputed location objects so the eval=FALSE chunks above are not re-run
load("location_data/all_zips.RData")
load("location_data/zip_dists.RData")
```
Now we need to remove brands that are duplicated within the same zip code and year. When this occurs it's usually one family or ranch that has registered a single brand multiple times for different locations on the animal. In our model, duplicated brand codes will correspond to variations of the same combination of symbols in the same order independently of the location on the animal.
```{r eval=FALSE}
#get concatenated brands with duplicate codes
concat_brands <- sapply(seq_len(nrow(brands)), function(x){
  #get the four components and the duplicate code
  temp <- brands$brand[[x]][c(1:4, 6)]
  #if the duplicate code is missing then remove the NA
  if(length(which(is.na(temp))) > 0){
    temp <- temp[-which(is.na(temp))]
  }
  #return a concatenated version
  paste0(temp, collapse = "")
})
#build data table to determine which rows are duplicated (accounting for zip code and year)
concat_brands <- data.table::data.table(brand = concat_brands, location = brands$location, year = brands$year)
#remove duplicated rows from the main data table
#!duplicated() is used instead of -which(duplicated()) so that nothing is dropped when
#there are no duplicates (brands[-integer(0), ] would otherwise remove every row)
brands <- brands[!duplicated(concat_brands), ]
#remove temporary object
rm(concat_brands)
```
Now we need to turn our brand data into a numeric matrix so it is compatible with our ABM. In this matrix, brands will be stored as a vector of eight numbers, where the first four correspond to the components (with zeroes for empty component codes) and the last four correspond to the angles of rotation (with zeroes for unrotated components). For this, we will just replace each component code with a number denoting its position in the `components` vector, and assign a zero to empty positions. We will also append zip codes and years.
Once brands are converted let's go ahead and save the object so we don't have to execute all of this code every time.
```{r eval=FALSE}
#convert brands to a numeric matrix: columns 1-4 are component indices (0 = empty slot),
#columns 5-8 are the corresponding rotation angles (0 = unrotated), column 9 is the zip
#code, and column 10 is the year
#create empty matrix for converted brands
converted_brands <- matrix(0, nrow = nrow(brands), ncol = 10)
#append zip codes and years
converted_brands[, 9] <- as.numeric(brands$location)
converted_brands[, 10] <- as.numeric(brands$year)
#iterate through the brands
for(i in 1:nrow(brands)){
#extract the index numbers of components (ignoring rotation)
brand_nums <- match(all_poss_components$index[match(brands$brand[[i]][1:4], all_poss_components$rot)], components)
#store eventual numeric order of components
comp_order <- order(brand_nums)
#replace empty values with 0
brand_nums[is.na(brand_nums)] <- 0
#create empty vector of angles
angle_nums <- rep(0, 4)
#iterate through the components
#NOTE: since the NAs were just replaced with 0, this loop always runs over all four slots;
#empty slots fail the match below, produce a zero-length diff, and keep their angle of 0
for(j in 1:length(brand_nums[!is.na(brand_nums)])){
#check if there are any characters that differ between the actual component and its index
#form — any difference is the rotation digit
temp <- as.numeric(setdiff(strsplit(all_poss_components$rot[match(brands$brand[[i]][j], all_poss_components$rot)], split = "")[[1]],
strsplit(all_poss_components$index[match(brands$brand[[i]][j], all_poss_components$rot)], split = "")[[1]]))
#if there are, replace the corresponding 0 in the vector of angles
if(length(temp) > 0){angle_nums[j] <- temp}
}
#store numeric brands and angles in the matrix (components sorted in ascending numeric order, and angles re-ordered accordingly)
converted_brands[i, 1:4] <- brand_nums[comp_order]
converted_brands[i, 5:8] <- angle_nums[comp_order]
#remove temporary objects
rm(list = c("brand_nums", "comp_order", "angle_nums", "temp"))
}
#rewrite brands and remove original brands
brands <- converted_brands
rm(converted_brands)
#save
save(brands, file = "converted_brands.RData")
```
```{r echo=FALSE}
#reload the converted numeric brand matrix so the chunk above does not need to be re-run
load("converted_brands.RData")
```
Here's a sample of what this matrix looks like.
```{r}
#print the first ten rows of the converted brand matrix
brands[1:10,]
```
And here is the distribution of brands in terms of numbers of components.
```{r}
#tabulate how many brands have each number of non-empty components (non-zero slots in columns 1-4)
table(sapply(seq_len(nrow(brands)), function(x){sum(brands[x, 1:4] != 0)}))
```
```{r echo = FALSE}
#assign every Kansas county to one of four geographic quadrants (NW/SW/NE/SE), per the
#quadrant definitions in the preregistration document
#create data table of counties
counties <- tigris::list_counties("KS")
counties <- data.table::data.table(county = counties$county, quadrant = rep(NA, nrow(counties)))
#NW counties
counties$quadrant[which(counties$county %in% c("Cheyenne", "Rawlins", "Decatur",
"Norton", "Phillips", "Smith",
"Sherman", "Thomas", "Sheridan",
"Graham", "Rooks", "Osborne",
"Wallace", "Logan", "Gove",
"Trego", "Ellis", "Russell"))] <- "NW"
#SW counties
counties$quadrant[which(counties$county %in% c("Greeley", "Wichita", "Scott",
"Lane", "Ness", "Rush", "Barton",
"Hamilton", "Kearny", "Finney",
"Hodgeman", "Pawnee", "Stafford",
"Stanton", "Grant", "Haskell",
"Gray", "Ford", "Edwards",
"Kiowa", "Pratt", "Morton",
"Stevens", "Seward", "Meade",
"Clark", "Comanche", "Barber"))] <- "SW"
#NE counties
counties$quadrant[which(counties$county %in% c("Jewell", "Republic",
"Washington", "Marshall",
"Nemaha", "Brown", "Doniphan",
"Mitchell", "Cloud", "Clay",
"Riley", "Pottawatomie",
"Jackson", "Atchison",
"Jefferson", "Leavenworth",
"Wyandotte", "Lincoln", "Ottawa",
"Ellsworth", "Saline",
"Dickinson", "Geary", "Morris",
"Wabaunsee", "Shawnee", "Osage",
"Douglas", "Franklin", "Johnson",
"Miami"))] <- "NE"
#SE counties
counties$quadrant[which(counties$county %in% c("Rice", "McPherson", "Marion",
"Chase", "Lyon", "Coffey",
"Anderson", "Linn", "Reno",
"Harvey", "Kingman", "Sedgwick",
"Butler", "Greenwood", "Woodson",
"Allen", "Bourbon", "Elk",
"Wilson", "Neosho", "Crawford",
"Harper", "Sumner", "Cowley",
"Chautauqua", "Montgomery",
"Labette", "Cherokee"))] <- "SE"
#add zip codes to data table (one list of zip codes per county)
data("zip_code_db", package = "zipcodeR")
counties$zip <- lapply(seq_len(nrow(counties)), function(x){zipcodeR::search_county(counties$county[x], state_abb = "KS")$zipcode})
#create a thesaurus of which zip codes are in which quadrants
#(vectorized with rep()/lengths()/unlist() instead of growing vectors with c() in a loop)
quad_thes <- data.frame(quadrant = rep(counties$quadrant, lengths(counties$zip)),
zip = unlist(counties$zip))
#get quadrant for each brand in the dataset (column 9 is the zip code)
brands_quadrants <- quad_thes$quadrant[match(brands[, 9], quad_thes$zip)]
#get number of components per brand (non-zero slots in columns 1-4)
n_components <- sapply(seq_len(nrow(brands)), function(x){4 - length(which(brands[x, 1:4] == 0))})
#get number of two- and three-component brands that are unique to 1990 (old) or unique to the later years (young)
#first, for each unique brand code (discounting location), we need the years in which it appears
concat_brands <- as.factor(sapply(seq_len(nrow(brands)), function(x){paste0(brands[x, 1:8], collapse = " ")}))
unique_concat_brands <- levels(concat_brands)
years_appear <- lapply(unique_concat_brands, function(x){unique(brands[which(concat_brands == x), 10])})
#now, we can get which brands are either only old or only young
which_old <- sapply(seq_along(concat_brands), function(x){all(years_appear[[which(unique_concat_brands == concat_brands[x])]] == 1990)})
which_young <- sapply(seq_along(concat_brands), function(x){all(years_appear[[which(unique_concat_brands == concat_brands[x])]] != 1990)})
#what percent of component types are singletons (i.e. only appear once)
#logical subsetting is used instead of -which() so that nothing is dropped when there are
#no empty (zero) slots
all_components <- c(brands[, 1:4])
all_components <- all_components[all_components != 0]
percent_singleton <- round((length(which(as.numeric(table(all_components)) == 1))/length(as.numeric(table(all_components))))*100, 2)
```
The total number of brands in the dataset, after removing duplicates within zip codes, is `r scales::comma(nrow(brands))`. For brands with two components, `r scales::comma(length(which(n_components == 2 & which_old)))` of them only appear in 1990 ("old") and `r scales::comma(length(which(n_components == 2 & which_young)))` of them only appear in 2008-2016 ("young"). For brands with three components, `r scales::comma(length(which(n_components == 3 & which_old)))` of them are old and `r scales::comma(length(which(n_components == 3 & which_young)))` of them are young. Only `r round((length(which(n_components == 4))/length(n_components))*100, 2)`% of brands have four components, and only `r percent_singleton`% of component types are singletons (i.e. only appear once). Here is a further breakdown of the brands in terms of the four geographic quadrants described in the preregistration document, for each combination of age and complexity:
```{r echo = FALSE, eval = FALSE}
#create function for removing singletons from subset
#remove rows that contain any singleton component from a matrix subset
#a singleton is a component value (including 0) that appears exactly once across columns 1-4
#of the whole subset; the singleton set is computed once up front (the original recomputed
#the full frequency table for every row, which was O(n^2)), and rows are only removed when
#at least one exists — matrix[-integer(0), ] would otherwise drop every row
singleton_removal <- function(matrix){
  #component values that occur exactly once across the whole subset
  singletons <- as.numeric(names(which(table(c(matrix[, 1:4])) == 1)))
  #rows containing at least one singleton
  to_remove <- which(sapply(seq_len(nrow(matrix)), function(x){length(which(matrix[x, 1:4] %in% singletons)) > 0}))
  if(length(to_remove) > 0){
    matrix <- matrix[-to_remove, ]
  }
  return(matrix)
}
#set random seed
set.seed(12345)
#create structured and mixed datasets
#for each combination of age (o = old, y = young), quadrant (NE/NW/SE/SW), and
#complexity (2 or 3 components):
#  - the "structured" subset holds all brands of that age/quadrant/complexity
#  - the matching "_mixed" subset is a size-matched random draw from the same
#    quadrant and complexity, ignoring age
#NOTE(review): the sample() calls consume the RNG stream in a fixed statement order,
#so reordering these lines would change the saved subsets despite the fixed seed
o_ne_2 <- brands[which(which_old & brands_quadrants == "NE" & n_components == 2), ]
o_ne_2_mixed <- brands[sample(which(brands_quadrants == "NE" & n_components == 2), nrow(o_ne_2)), ]
o_ne_3 <- brands[which(which_old & brands_quadrants == "NE" & n_components == 3), ]
o_ne_3_mixed <- brands[sample(which(brands_quadrants == "NE" & n_components == 3), nrow(o_ne_3)), ]
y_ne_2 <- brands[which(which_young & brands_quadrants == "NE" & n_components == 2), ]
y_ne_2_mixed <- brands[sample(which(brands_quadrants == "NE" & n_components == 2), nrow(y_ne_2)), ]
y_ne_3 <- brands[which(which_young & brands_quadrants == "NE" & n_components == 3), ]
y_ne_3_mixed <- brands[sample(which(brands_quadrants == "NE" & n_components == 3), nrow(y_ne_3)), ]
o_nw_2 <- brands[which(which_old & brands_quadrants == "NW" & n_components == 2), ]
o_nw_2_mixed <- brands[sample(which(brands_quadrants == "NW" & n_components == 2), nrow(o_nw_2)), ]
o_nw_3 <- brands[which(which_old & brands_quadrants == "NW" & n_components == 3), ]
o_nw_3_mixed <- brands[sample(which(brands_quadrants == "NW" & n_components == 3), nrow(o_nw_3)), ]
y_nw_2 <- brands[which(which_young & brands_quadrants == "NW" & n_components == 2), ]
y_nw_2_mixed <- brands[sample(which(brands_quadrants == "NW" & n_components == 2), nrow(y_nw_2)), ]
y_nw_3 <- brands[which(which_young & brands_quadrants == "NW" & n_components == 3), ]
y_nw_3_mixed <- brands[sample(which(brands_quadrants == "NW" & n_components == 3), nrow(y_nw_3)), ]
o_se_2 <- brands[which(which_old & brands_quadrants == "SE" & n_components == 2), ]
o_se_2_mixed <- brands[sample(which(brands_quadrants == "SE" & n_components == 2), nrow(o_se_2)), ]
o_se_3 <- brands[which(which_old & brands_quadrants == "SE" & n_components == 3), ]
o_se_3_mixed <- brands[sample(which(brands_quadrants == "SE" & n_components == 3), nrow(o_se_3)), ]
y_se_2 <- brands[which(which_young & brands_quadrants == "SE" & n_components == 2), ]
y_se_2_mixed <- brands[sample(which(brands_quadrants == "SE" & n_components == 2), nrow(y_se_2)), ]
y_se_3 <- brands[which(which_young & brands_quadrants == "SE" & n_components == 3), ]
y_se_3_mixed <- brands[sample(which(brands_quadrants == "SE" & n_components == 3), nrow(y_se_3)), ]
o_sw_2 <- brands[which(which_old & brands_quadrants == "SW" & n_components == 2), ]
o_sw_2_mixed <- brands[sample(which(brands_quadrants == "SW" & n_components == 2), nrow(o_sw_2)), ]
o_sw_3 <- brands[which(which_old & brands_quadrants == "SW" & n_components == 3), ]
o_sw_3_mixed <- brands[sample(which(brands_quadrants == "SW" & n_components == 3), nrow(o_sw_3)), ]
y_sw_2 <- brands[which(which_young & brands_quadrants == "SW" & n_components == 2), ]
y_sw_2_mixed <- brands[sample(which(brands_quadrants == "SW" & n_components == 2), nrow(y_sw_2)), ]
y_sw_3 <- brands[which(which_young & brands_quadrants == "SW" & n_components == 3), ]
y_sw_3_mixed <- brands[sample(which(brands_quadrants == "SW" & n_components == 3), nrow(y_sw_3)), ]
# #remove singletons from all of these (currently disabled; see singleton_removal above)
# o_ne_2 <- singleton_removal(o_ne_2)
# o_ne_2_mixed <- singleton_removal(o_ne_2_mixed)
# o_ne_3 <- singleton_removal(o_ne_3)
# o_ne_3_mixed <- singleton_removal(o_ne_3_mixed)
# y_ne_2 <- singleton_removal(y_ne_2)
# y_ne_2_mixed <- singleton_removal(y_ne_2_mixed)
# y_ne_3 <- singleton_removal(y_ne_3)
# y_ne_3_mixed <- singleton_removal(y_ne_3_mixed)
# o_nw_2 <- singleton_removal(o_nw_2)
# o_nw_2_mixed <- singleton_removal(o_nw_2_mixed)
# o_nw_3 <- singleton_removal(o_nw_3)
# o_nw_3_mixed <- singleton_removal(o_nw_3_mixed)
# y_nw_2 <- singleton_removal(y_nw_2)
# y_nw_2_mixed <- singleton_removal(y_nw_2_mixed)
# y_nw_3 <- singleton_removal(y_nw_3)
# y_nw_3_mixed <- singleton_removal(y_nw_3_mixed)
# o_se_2 <- singleton_removal(o_se_2)
# o_se_2_mixed <- singleton_removal(o_se_2_mixed)
# o_se_3 <- singleton_removal(o_se_3)
# o_se_3_mixed <- singleton_removal(o_se_3_mixed)
# y_se_2 <- singleton_removal(y_se_2)
# y_se_2_mixed <- singleton_removal(y_se_2_mixed)
# y_se_3 <- singleton_removal(y_se_3)
# y_se_3_mixed <- singleton_removal(y_se_3_mixed)
# o_sw_2 <- singleton_removal(o_sw_2)
# o_sw_2_mixed <- singleton_removal(o_sw_2_mixed)
# o_sw_3 <- singleton_removal(o_sw_3)
# o_sw_3_mixed <- singleton_removal(o_sw_3_mixed)
# y_sw_2 <- singleton_removal(y_sw_2)
# y_sw_2_mixed <- singleton_removal(y_sw_2_mixed)
# y_sw_3 <- singleton_removal(y_sw_3)
# y_sw_3_mixed <- singleton_removal(y_sw_3_mixed)
#save structured and mixed datasets
#collect the 32 subset objects into one named list via mget() and write it to disk
subset_names <- c("o_ne_2", "o_ne_2_mixed", "o_ne_3", "o_ne_3_mixed",
                  "y_ne_2", "y_ne_2_mixed", "y_ne_3", "y_ne_3_mixed",
                  "o_nw_2", "o_nw_2_mixed", "o_nw_3", "o_nw_3_mixed",
                  "y_nw_2", "y_nw_2_mixed", "y_nw_3", "y_nw_3_mixed",
                  "o_se_2", "o_se_2_mixed", "o_se_3", "o_se_3_mixed",
                  "y_se_2", "y_se_2_mixed", "y_se_3", "y_se_3_mixed",
                  "o_sw_2", "o_sw_2_mixed", "o_sw_3", "o_sw_3_mixed",
                  "y_sw_2", "y_sw_2_mixed", "y_sw_3", "y_sw_3_mixed")
subsets <- mget(subset_names)
save(subsets, file = "analysis/shuffling_model/subsets.RData")
```
```{r echo = FALSE}
#construct and print table of sample sizes for each combination of age, complexity, and quadrant
#coerce quadrant labels to a fixed-level factor so counts always align with the column
#names, even if a quadrant happens to be absent from one of the subsets (a bare
#as.numeric(table(...)) would silently shift the columns in that case)
quadrant_levels <- c("NE", "NW", "SE", "SW")
quadrant_counts <- function(rows){
  as.numeric(table(factor(brands_quadrants[rows], levels = quadrant_levels)))
}
sample_table <- rbind(quadrant_counts(which(n_components == 2 & which_old)),
                      quadrant_counts(which(n_components == 3 & which_old)),
                      quadrant_counts(which(n_components == 2 & which_young)),
                      quadrant_counts(which(n_components == 3 & which_young)))
sample_table <- as.data.frame(sample_table)
colnames(sample_table) <- quadrant_levels
rownames(sample_table) <- c("2-comp old", "3-comp old", "2-comp young", "3-comp young")
sample_table
```
In short, we have ample sample size to conduct all of the analyses proposed in the preregistration document.
Finally, if a large number of brands are registered in cities, where ranches are unlikely to be located, then it could be indicative of deeper data quality issues. The four [most populous counties in Kansas](http://www.usa.com/rank/kansas-state--population-density--county-rank.htm) are Johnson, Wyandotte, Sedgwick, and Shawnee. Collectively, these counties include the three biggest metropolitan areas: Kansas City, Topeka, and Wichita.
```{r}
#collect zip codes for top four counties
johnson <- zipcodeR::search_county("Johnson", "KS")
wyandotte <- zipcodeR::search_county("Wyandotte", "KS")
sedgwick <- zipcodeR::search_county("Sedgwick", "KS")
shawnee <- zipcodeR::search_county("Shawnee", "KS")
#compile p. o. box and normal zips across the four counties
metro_counties <- list(johnson, wyandotte, sedgwick, shawnee)
#extract the zip codes of a given type from one county lookup
zips_of_type <- function(county, type){
  county$zipcode[which(county$zipcode_type == type)]
}
po_box_zips <- unlist(lapply(metro_counties, zips_of_type, type = "PO Box"))
normal_zips <- unlist(lapply(metro_counties, zips_of_type, type = "Standard"))
#get proportion of brands that are in metropolitan areas, and in p. o. boxes (column 9 holds the zip code)
prop_metro <- length(which(brands[, 9] %in% c(po_box_zips, normal_zips)))/nrow(brands)
prop_po_box <- length(which(brands[, 9] %in% po_box_zips))/nrow(brands)
```
Luckily, it appears that only `r round(prop_metro*100, digits = 2)`% of brands are registered in metropolitan areas, and only `r round(prop_po_box*100, digits = 2)`% are registered to metropolitan P. O. Boxes. This suggests that brand registration in cities is not a significant issue.
## Description of the ABM
The agent-based model simulates ranchers creating new brands every year based on the components (and optionally angles) present in the brands used by ranchers around them, as well as dual constraints of simplicity and complexity. For example, with the parameters in the ABM we can run a model where ranchers copy components at a large geographic scale while being as different as possible from their closest neighbors, and try to create brands that are complex enough that they aren't easily faked but simple enough that they are easily legible.
When a new brand is created, the components and angles around it are sampled within two radii: one in which components and angles are more likely to be used (henceforth copying), and one in which components and angles are less likely to be used (henceforth distinctiveness). All of the components present in each radius are compiled into frequency tables that are used for weighted random sampling. Once components have been sampled, all of the rotations of each component that appear in each radius are compiled as well. The probability that a rancher uses a particular component (or angle) is based on the frequency of it within the copying radius, raised to an exponent $C$, and the inverse frequency of it within the distinctiveness radius, raised to an exponent $D$.
<!-- $$P(x) = F_x^C \times \left(\frac{1}{F_x}\right)^D$$ -->
<center><img src="https://render.githubusercontent.com/render/math?math=P(x)%20=%20F_x^C%20%5Ctimes%20%5Cleft(%5Cfrac{1}{F_x}%5Cright)^D"></center>
$C$ and $D$ control the strength of copying and distinctiveness in the probability of adopting components and angles, where zero is neutrality, one is proportional to their observed frequencies, and values greater than one increase their influence beyond their observed frequencies. When only a subset of angles are possible for a particular component then only the frequencies of those particular angles are considered.
The map below shows an example of copying and distinctiveness radii (200 km and 100 km, respectively) around a target zip code, shown by the green triangle. In this example, components and angles within the larger copying radius (blue) would be more likely to appear in the target zip code, whereas brands within the smaller distinctiveness radius (red) would be less likely to appear. Here we plot the geographic centroids of each zip code rather than their boundaries, as the only shapefiles available at the zip code level are for the US Census' "zip code tabulation areas", which do not include all locations in the dataset.
```{r echo=FALSE, fig.align="center", dpi=300, out.width="75%", fig.width=5.5, fig.height=3}
#get zip codes within 200 km and 100 km of the example target location (zip 67427)
target_zip <- 67427
target_col <- which(colnames(zip_dists) == target_zip)
copy_example <- names(which(zip_dists[, target_col] <= 200))
dist_example <- names(which(zip_dists[, target_col] <= 100))
#state outline for the background
kansas_map <- ggplot2::map_data("state", "kansas")
#one row per zip centroid; color codes: 0 = background, 1 = copying radius,
#2 = distinctiveness radius, 3 = target (which is also the only triangle)
zip_points <- data.frame(zip = all_zips$zip, lon = all_zips$lon, lat = all_zips$lat,
                         color = 0, shape = 0)
zip_points$color[which(zip_points$zip %in% copy_example)] <- 1
zip_points$color[which(zip_points$zip %in% dist_example)] <- 2
zip_points$color[which(zip_points$zip == target_zip)] <- 3
zip_points$shape[which(zip_points$zip == target_zip)] <- 1
zip_points$color <- as.factor(zip_points$color)
zip_points$shape <- as.factor(zip_points$shape)
#points first, then the state polygon outline on top
ggplot2::ggplot(kansas_map, ggplot2::aes(x = long, y = lat)) +
  ggplot2::geom_point(zip_points, mapping = ggplot2::aes(x = lon, y = lat, colour = color, shape = shape, size = shape)) +
  ggplot2::geom_polygon(fill = "transparent", colour = "black") +
  ggplot2::scale_colour_manual(values = c("grey90", "#0072B2", "#D55E00", "#009E73")) +
  ggplot2::scale_shape_manual(values = c(16, 17)) +
  ggplot2::scale_size_manual(values = c(2, 8)) +
  ggplot2::coord_quickmap() +
  ggplot2::theme_minimal() +
  ggplot2::labs(x = "Longitude", y = "Latitude") +
  ggplot2::theme(legend.position = "none")
#remove temporary objects
rm(list = c("copy_example", "dist_example", "kansas_map", "zip_points", "target_zip", "target_col"))
```
To simulate simplicity and complexity, the number of components in each new brand is drawn from a Poisson distribution where $\lambda$ is the parameter of interest (henceforth complexity). Below is an example of the normalized probabilities of creating a brand with between one and four components when $\lambda$ is 0.5, 3, and 15. Note that the normalization here is only for plotting purposes - we will use the raw output of `dpois` as the `prob` argument of base R's `sample` function.
```{r echo = FALSE, fig.align="center", dpi=300, out.width="75%", fig.width=5, fig.height=3}
#create data frame for poisson example: range-normalized dpois probabilities of
#1-4 components at three values of lambda
lambdas <- c(0.5, 3, 15)
comp_range <- c(1, 2, 3, 4)
pois_example <- data.frame(n_comps = rep(comp_range, times = length(lambdas)),
                           probs = unlist(lapply(lambdas, function(l){BBmisc::normalize(dpois(comp_range, l), "range")})),
                           lambda = rep(lambdas, each = length(comp_range)))
#plot poisson example (colour in the top-level aes implies the grouping, so no explicit group is needed)
ggplot2::ggplot(data = pois_example, ggplot2::aes(x = n_comps, y = probs, color = as.factor(lambda))) +
  ggplot2::geom_line() +
  ggplot2::geom_point() +
  ggplot2::labs(x = "Number of Components", y = "Normalized Probabilities", color = "\u03bb") +
  ggplot2::scale_color_manual(values = c("#0072B2", "#D55E00", "#009E73")) +
  ggplot2::theme_linedraw()
#remove temporary objects
rm(list = c("pois_example", "lambdas", "comp_range"))
```
At the beginning of each year in the ABM a set of `n_new` brands is created, where `n_new` is the average number of new brands that appear each year in the observed data. Each new brand is assigned a zip code, where the probability of each zip code is proportional to its frequency in all years of the observed data. After new brands are created a set of `n_old` random brands is removed, where `n_old` is the average number of brands that disappear each year in the observed data.
The ABM is initialized with the earliest year of observed data and runs through every year until the final year of observed data. In order to fit the parameters of the ABM to the observed data we also need to calculate a set of summary statistics that capture both the overall diversity and spatial diversity of components and brands. Right now the ABM collects the following summary statistics at the end of each year:
* For components:
- Overall diversity:
1. Proportion of components that are the most common type
2. Proportion of components that are the most rare type
3. Shannon's diversity index
4. Simpson's diversity index
- Spatial diversity:
1. Jaccard index of beta diversity (zip codes)
2. Morisita-Horn index of beta diversity (zip codes)
3. Jaccard index of beta diversity (counties)
4. Morisita-Horn index of beta diversity (counties)
* For brands:
- Overall diversity:
1. Mean Levenshtein distance (from random 10%)
For all diversity metrics we calculate their Hill number counterparts, because they are [measured on the same scale](https://onlinelibrary.wiley.com/doi/10.1111/oik.07202) and [better account for relative abundance](https://esajournals.onlinelibrary.wiley.com/doi/abs/10.1890/13-0133.1). Shannon's diversity index emphasizes more rare types whereas Simpson's diversity index emphasizes more common types. The Jaccard and Morisita-Horn indices were similarly chosen for their complementarity. The Morisita-Horn index is a commonly used [abundance-based](https://onlinelibrary.wiley.com/doi/10.1111/j.1541-0420.2005.00489.x) beta diversity index, whereas the Jaccard index is the most robust of the [incidence-based](https://esajournals.onlinelibrary.wiley.com/doi/10.1002/ecs2.2100) beta diversity indices to sampling error. We calculate beta diversity at both the zip code and county-level to assess spatial diversity at two different resolutions. Diversity indices are not calculated at the level of brands, since duplicated brands in the model are not actually the same type (i.e. duplicates in the model and data correspond to slight variations of the same set of components that are only coded as the same). The mean Levenshtein distance (a.k.a. edit distance), or the [minimum number of insertions, deletions, and substitutions required to convert one sequence to another](https://stat.ethz.ch/R-manual/R-devel/library/utils/html/adist.html), is calculated from a random 10% of brands (specified by the `edit_dist_prop` argument of the ABM function).
The ABM parameters will be fit to the observed data using the random forest version of approximate Bayesian computation (ABC). Random forest ABC is [robust to the number of summary statistics](https://academic.oup.com/bioinformatics/article/32/6/859/1744513) and even [ranks them according to their importance](https://academic.oup.com/bioinformatics/article/35/10/1720/5132692), which means that we will not have to [reduce the dimensionality](https://projecteuclid.org/journals/statistical-science/volume-28/issue-2/A-Comparative-Review-of-Dimension-Reduction-Methods-in-Approximate-Bayesian/10.1214/12-STS406.full) of our summary statistics prior to inference.
## Test of the ABM
The ABM functions are in the `cattlebrandABM.R` file. Please refer to the (heavily commented) functions in this file for details.
```{r}
#load the ABM functions (cattlebrandABM and its helpers) into the current session
source("cattlebrandABM.R")
```
Before testing the ABM we need to calculate a couple of things - first the probability of rotation, which for now is just the proportion of brands in the full dataset that are rotated.
```{r}
#probability of rotation (proportion of rotated brands in the full dataset)
#NOTE(review): assumes the first cell of the table is the unrotated code — confirm
#that the lowest value in columns 5:8 marks "no rotation"
rotation_table <- table(brands[, 5:8])
rot_prob <- as.numeric(1 - (rotation_table[1]/sum(rotation_table)))
```
Now we can subset the brand data to (1) calculate the average number of new and old brands per year, and (2) have the first year of data separated to initialize the model with.
```{r}
#separate brands data by year (column 10 holds the year; columns 1:9 the brand + zip)
brands_by_year <- function(yr){
  data.table::data.table(brands[which(brands[, 10] == yr), 1:9])
}
brands_1990 <- brands_by_year(1990)
brands_2008 <- brands_by_year(2008)
brands_2014 <- brands_by_year(2014)
brands_2015 <- brands_by_year(2015)
brands_2016 <- brands_by_year(2016)
#calculate average number of new brands that appear each year and old brands that
#disappear each year (fsetdiff gets rows of first that are not in second);
#the 1990-2008 and 2008-2014 gaps are annualized by dividing by 18 and 6
annualized_turnover <- function(first, second, gap){
  nrow(data.table::fsetdiff(first, second))/gap
}
n_new <- mean(c(annualized_turnover(brands_2008, brands_1990, 18),
                annualized_turnover(brands_2014, brands_2008, 6),
                annualized_turnover(brands_2015, brands_2014, 1),
                annualized_turnover(brands_2016, brands_2015, 1)))
n_old <- mean(c(annualized_turnover(brands_1990, brands_2008, 18),
                annualized_turnover(brands_2008, brands_2014, 6),
                annualized_turnover(brands_2014, brands_2015, 1),
                annualized_turnover(brands_2015, brands_2016, 1)))
```
We also need to reshape components so it includes whether or not a component is rotatable (or includes at least one comma).
```{r}
#reshape components so each row stores the component code and whether it is rotatable
components <- data.table::data.table(components = components, rotatable = rotation$rot)
#the raw rotatable column encodes missingness as the literal string "NA"; convert to real NA
components$rotatable[which(components$rotatable == "NA")] <- NA
#coerce each entry to numeric; vapply replaces the sapply(1:nrow(...)) index loop with a
#direct, type-stable elementwise map (as.numeric(NA) yields NA_real_, preserving missingness)
components$rotatable <- vapply(components$rotatable, function(v){as.numeric(v)}, numeric(1))
#save
save(components, file = "components.RData")
```
Eventually we can use the code below to set the limits of our prior for the sizes of the two radii.
```{r}
#get min and max distances between zip codes (to inform prior range)
#zero distances are self-distances, so exclude them before taking the minimum
positive_dists <- zip_dists[which(zip_dists != 0)]
round(min(positive_dists))
round(max(zip_dists))
```
Okay, now we are ready to do a test run of the ABM. We'll set the complexity ($\lambda$) to 3, the copying radius to 200 km, the distinctive radius to 100 km, and the strength of both copying and distinctiveness to 1. The model will be initialized with the data from 1990, and summary statistics will be collected for 2008, 2014, 2015, and 2016. We'll also run one model that ignores the angles of components, and one that takes it into account.
```{r}
#test out the components-only ABM (and get runtime)
start <- Sys.time()
components_only <- cattlebrandABM(init_brands = as.matrix(brands_1990), components, all_zips, zip_dists,
                                  init_year = 1990, sampling_years = c(2008, 2014, 2015, 2016), n_new, n_old,
                                  rot_prob, complexity = 3, copy_radius = 200, copy_strength = 1,
                                  dist_radius = 100, dist_strength = 1, angles = FALSE, edit_dist_prop = 0.1)
Sys.time() - start
#print output
components_only
#test out the components and angles ABM (and get runtime)
start <- Sys.time()
#fixed: initialize from the 1990 data to match init_year = 1990 and the components-only
#run above (this call previously passed brands_2008 despite the prose stating both
#models are initialized with the 1990 data)
components_angles <- cattlebrandABM(init_brands = as.matrix(brands_1990), components, all_zips, zip_dists,
                                    init_year = 1990, sampling_years = c(2008, 2014, 2015, 2016), n_new, n_old,
                                    rot_prob, complexity = 3, copy_radius = 200, copy_strength = 1,
                                    dist_radius = 100, dist_strength = 1, angles = TRUE, edit_dist_prop = 0.1)
Sys.time() - start
#print output
components_angles
```
The output of each model is a matrix with a row for each of the four sampling years, and a column for each of the nine summary statistics collected in that year. After some profiling and optimization (using `profvis`) the runtime for the agent-based model is just barely fast enough for generative inference. I tried to further optimize the way that unique brands and component/angle combinations are handled by leaving them as matrices of integers instead of converting them into concatenated strings, but surprisingly the string method is significantly faster.
Now let's ensure that variation in our parameter values actually leads to variation in our summary statistics. To do this, we'll run 1,000 simulations of the model assuming the same radii as above but with varying parameter values for `complexity`, `copy_strength`, and `dist_strength`. By plotting each dynamic parameter value against the summary statistics that they generate, we can ensure that variation in one corresponds to variation in the other.
```{r eval=FALSE}
#set number of simulations and cores
n_sims <- 1000
n_cores <- parallel::detectCores()-1
#get uniform priors for three main dynamic parameters
priors <- data.frame(complexity = runif(n_sims, min = 0.5, max = 15),
                     copy_strength = runif(n_sims, min = 0, max = 5),
                     dist_strength = runif(n_sims, min = 0, max = 5))
#one simulation with the i-th prior draw; radii fixed at 200/100 km as in the test runs
#NOTE(review): init_brands is brands_2008 while init_year = 1990 — confirm this is intentional
run_one <- function(i){
  cattlebrandABM(init_brands = as.matrix(brands_2008), components, all_zips, zip_dists,
                 init_year = 1990, sampling_years = c(2016), n_new, n_old,
                 rot_prob, complexity = priors$complexity[i], copy_radius = 200,
                 copy_strength = priors$copy_strength[i], dist_radius = 100,
                 dist_strength = priors$dist_strength[i], angles = FALSE, edit_dist_prop = 0.1)
}
#run simulations in parallel
sum_stats <- parallel::mclapply(seq_len(n_sims), run_one, mc.cores = n_cores)
#simplify and save output
sum_stats <- do.call("rbind", sum_stats)
sim_test <- cbind(priors, sum_stats)
save(sim_test, file = "sim_test.RData")
```
In this plot, each row corresponds to the nine summary statistics and each column corresponds to the three dynamic parameters in the ABM (A: complexity, B: copying strength, C: distinctiveness strength).
```{r echo=FALSE, fig.align="center", fig.height=10, fig.width=3.5}
load("sim_test.RData")
#one scatter panel of parameter column param_idx against summary statistic stat_idx
#(summary statistics start at column 4 of sim_test); only panels in the first parameter
#column show a y-axis title (the statistic's number)
panel <- function(param_idx, stat_idx, show_y){
  p <- ggplot2::ggplot(sim_test, ggplot2::aes(sim_test[, param_idx], sim_test[, stat_idx + 3])) +
    ggplot2::geom_point(size = 0.2) +
    ggplot2::theme_void()
  if (show_y) {
    p <- p + ggplot2::theme(axis.title.y = ggplot2::element_text()) +
      ggplot2::labs(x = NULL, y = stat_idx)
  } else {
    p <- p + ggplot2::theme(axis.title.x = ggplot2::element_blank(),
                            axis.title.y = ggplot2::element_blank())
  }
  p
}
#nine statistics (rows) for each of the three parameters (columns)
plots <- c(lapply(1:9, function(i){panel(1, i, show_y = TRUE)}),
           lapply(1:9, function(i){panel(2, i, show_y = FALSE)}),
           lapply(1:9, function(i){panel(3, i, show_y = FALSE)}))
#label the bottom panel of each column (A: complexity, B: copying strength, C: distinctiveness strength)
bottom_labels <- c("A", "B", "C")
for (j in seq_along(bottom_labels)) {
  plots[[j * 9]] <- plots[[j * 9]] + ggplot2::theme(axis.title.x = ggplot2::element_text()) +
    ggplot2::labs(x = bottom_labels[j])
}
cowplot::plot_grid(plotlist = plots, nrow = 9, byrow = FALSE, label_x = c(1, 2, 3))
```