-
Notifications
You must be signed in to change notification settings - Fork 0
/
README.Rmd
892 lines (709 loc) · 53.5 KB
/
README.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
---
title: "Agent-Based Model of Cattle Brands"
author: Mason Youngblood
output: github_document
---
The associated preregistration document can be found on [OSF](https://osf.io/d4qv3/), and a detailed description of the structure of the cattle brand data can be found on the [corresponding GitHub](https://github.com/masonyoungblood/cattle_brand_data).
```{r echo=FALSE, fig.align="center", out.width="50%"}
#display a decorative header image (calves), centered at half width
img_url <- "https://images.unsplash.com/photo-1590249351139-c6effec78d2a?ixid=MnwxMjA3fDB8MHxzZWFyY2h8MjV8fGJhYnklMjBjb3dzfGVufDB8fDB8fA%3D%3D&ixlib=rb-1.2.1&w=1000&q=80"
knitr::include_graphics(img_url)
```
## Load and prepare brands data
First let's read in and clean up our brand data.
```{r}
#read in brand data, dropping the first (row index) column
brands <- read.csv("brand_data.csv")[, -1]
#remove brands with length of 12 (incorrectly specified according to manual inspection)
#logical subsetting is used instead of -which() so that nothing is dropped when no such brands exist
#(brands[-integer(0), ] would otherwise remove every row)
brands <- brands[nchar(brands$brand) != 12, ]
#substring brand codes into a vector of four components (3 digits each) and one location digit
brands$brand <- lapply(seq_len(nrow(brands)), function(x){substring(brands$brand[[x]], first = c(1, 4, 7, 10, 13), last = c(3, 6, 9, 12, 13))})
#print a sample
brands[1:10, ]
```
Each three digit code corresponds to a component in the brand, and the final single digit corresponds to the location of the brand on the animal. Note that the extra commas at the end of each brand component are just part of the data frame output. Some brands have a single digit in the 12th digit before the location digit to separate duplicates, where the same components are arranged in different ways. Let's extract those and put them into the sixth position of the brand vectors.
```{r eval=FALSE}
#move duplicate code (12th position) into the sixth position of the brand vectors
#(duplicates are brands with the same components arranged in different ways)
for(x in seq_len(nrow(brands))){
  #if the fourth position starts with a comma and ends with a number (or O, a typo for 0)
  #&& is used (rather than &) since both operands are scalar
  if(substring(brands$brand[[x]][4], 1, 1) == "," && substring(brands$brand[[x]][4], 3) %in% c("O", 0:9)){
    #put the duplicate number into the sixth position
    brands$brand[[x]][6] <- gsub(",", "", brands$brand[[x]][4])
    #replace any O with 0
    brands$brand[[x]][6] <- gsub("O", 0, brands$brand[[x]][6])
    #replace the original fourth position with ",,," (an empty component)
    brands$brand[[x]][4] <- ",,,"
  }
}
#save
save(brands, file = "raw_brands.RData")
```
```{r echo=FALSE}
#reload the cleaned brands so the eval=FALSE chunk above does not need to be re-run on every knit
load("raw_brands.RData")
```
Before we move any further we should determine whether letters in components tend to be initials or whether they should be treated as distinct. Here are the frequencies of letters in the brands.
```{r echo=FALSE, fig.align="center"}
#construct empty table for frequencies of letters in brands
#(length(LETTERS) is used for consistency — the original used lowercase `letters`, which
#happens to have the same length but reads as a different object)
brand_freq <- data.frame(letter = LETTERS, count = rep(0, length(LETTERS)))
#for each brand
for(i in seq_len(nrow(brands))){
  #get components that match letters (only first two characters since angles haven't been extracted yet)
  matches <- which(substr(brands$brand[[i]], 1, 2) %in% paste0(LETTERS, ","))
  #if there are matches
  if(length(matches) > 0){
    #store a frequency table of them temporarily
    temp <- table(substr(brands$brand[[i]], 1, 2)[matches])
    #add those frequencies to the existing counts
    brand_freq$count[match(names(temp), paste0(LETTERS, ","))] <- brand_freq$count[match(names(temp), paste0(LETTERS, ","))] + as.numeric(temp)
    rm(temp)
  }
  rm(matches)
}
#plot brand data
ggplot2::ggplot(data = brand_freq, ggplot2::aes(x = letter, y = count)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::theme_linedraw() +
  ggplot2::labs(x = "Letter", y = "Frequency", title = "Brands")
rm(brand_freq)
```
If we compare this with the distribution of first letters of surnames that appeared more than 100 times in the 2010 US Census we can see that the distributions are drastically different.
```{r echo=FALSE, fig.align="center"}
#load in all surnames that appeared more than 100 times in the 2010 US Census
census_names <- read.csv("Names_2010Census.csv")
#replace "(S)" in the census table with 0 ("(S)" marks suppressed percentage cells in the census file)
breakdown_matrix <- as.matrix(census_names[, 6:11])
breakdown_matrix[which(breakdown_matrix == "(S)")] <- 0
census_names[, 6:11] <- breakdown_matrix
#replace census table with adjusted counts by race (count * percentage column for each group)
census_names <- data.frame(letter = substr(census_names$name, 1, 1),
num_white = census_names$count*as.numeric(census_names$pctwhite),
num_black = census_names$count*as.numeric(census_names$pctblack),
num_api = census_names$count*as.numeric(census_names$pctapi),
num_na = census_names$count*as.numeric(census_names$pctaian),
num_mult = census_names$count*as.numeric(census_names$pct2prace),
num_hisp = census_names$count*as.numeric(census_names$pcthispanic))
#aggregate by letter
census_names <- aggregate(. ~ letter, data = census_names, sum)
#load in target racial breakdown for KS, which does not add up to 100% because of overlap (white, black, asian + pacific islander, native american, 2+, hispanic): https://www.census.gov/quickfacts/KS
ks_breakdown <- 2913314*c(0.863, 0.061, 0.033, 0.012, 0.031, 0.122)
#construct data frame for plotting
letter_data <- data.frame(letter = LETTERS,
name_freq = sapply(1:nrow(census_names), function(x){sum(census_names[x, 2:7])}),
#NOTE(review): ks_breakdown/census_names[x, 2:7] divides the KS target counts by this
#letter's per-group counts — confirm this is the intended reweighting (a product or
#proportional weighting may have been meant); the adjusted plot below is commented out
adj_name_freq = sapply(1:nrow(census_names), function(x){sum(ks_breakdown/census_names[x, 2:7])}))
#plot name data
ggplot2::ggplot(data = letter_data, ggplot2::aes(x = letter, y = name_freq)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::theme_linedraw() +
ggplot2::labs(x = "Letter", y = "Frequency", title = "Surnames")
#plot adjusted name data
#ggplot2::ggplot(data = letter_data, ggplot2::aes(x = letter, y = adj_name_freq)) +
# ggplot2::geom_bar(stat = "identity") +
# ggplot2::theme_linedraw() +
# ggplot2::labs(x = "Letter", y = "Adjusted Frequency", title = "Surnames")
rm(list = c("census_names", "breakdown_matrix", "ks_breakdown", "letter_data"))
```
```{r eval=FALSE, echo=FALSE}
#subset to the 2016 brand book once (the original recomputed this subset three times)
brands_2016 <- brands[which(brands$year == 2016), ]
#get booleans for brands that contain letters
has_letter <- sapply(seq_len(nrow(brands_2016)), function(x){
  length(which(substr(brands_2016$brand[[x]], 1, 2) %in% paste0(LETTERS, ","))) > 0
})
#randomly sample 200 of these to check manually (the original comment said 500, but 200 are drawn)
letters_check <- brands_2016[sample(which(has_letter), 200, replace = FALSE), ]
#collapse brand codes for checking, some will be different from the book since we've already moved the duplicate indicator to the final position
letters_check$brand <- sapply(seq_len(nrow(letters_check)), function(x){paste0(letters_check$brand[[x]], collapse = "")})
#save
write.csv(letters_check, file = "letters_check.csv")
#remove temporary object
rm(brands_2016)
```
```{r}
#load in the manually checked brands with letters and compute the percentage
#of brands where the letters were confirmed to be initials
prop_correct <- 100 * mean(read.csv("letters_checked.csv")$initials)
```
Weighting the US surname distribution so that it matches the ethnic composition of Kansas does not solve the issue, so we manually checked 200 random brands with letters in the 2016 brand book. In `r prop_correct`% of the random brands at least one of the letters corresponded to one or both of the first or last names of the owners or the ranch (with some letters modifying others, like A and L both appearing in a brand where the L is a "leg" on the right side of the A in a brand owned by the family Asmussen, and some characters combined to make a letter, like L and 7 combined into an S for Schneider).
Since a significant number of letters in brands are not initials, some letters (e.g. I and O) are also used to code shapes, and different letters have different possible rotation angles, we chose to keep them as separate categories. Numbers were also included as separate categories. Some of the symbols in the brand book have variants that are distinguished using the third digit that is usually used for rotation (e.g. AR, BB, MI, TK, and TR). We collapsed each set of these symbols into single categories that can be rotated just like any other component.
We made some further adjustments to Kansas' coding scheme by removing redundant components and restricting the possible angles of rotations for different symbols. All of these adjustments are stored in `components.xlsx`: the `collapse` sheet has which components should be converted to what along with their new angles, and the `rotation` sheet has which components can be rotated in which directions. For clarification, the possible rotation angles for each component are the first unique rotations for that shape. For example, the 1, 3, and 4 rotations for the letter A are all identical (upside down) so 1 is kept as the first unique rotation. The `rotation` sheet also has a column for each angle (1 to 9), where the value for each component is whether that angle is correct if it appears (NA), needs to be corrected to no rotation (0), or needs to be corrected to a different rotation (1 to 9).
All letter variations were converted to single categories. For example, A1 and A2, two stylistic variations of A with square and rounded tops, were both converted to A. The letter I is used primarily for vertical lines, so the other single line symbols (i.e. /, \\, and -) were converted to I (with their associated angles). 1, which could be easily mistaken for I, does not appear in the coding scheme. Double and triple lines are stored as separate symbols (i.e. = and -3). We chose to keep these as separate categories rather than convert them to multiple repetitions of I, as reducing components to their constituent parts introduces many other issues and could inflate the measured complexity (or number of components) of some brands.
), QC, and QC1 were all converted to ( (with their associated angles). ], BX1, and BX2 were all converted to [ (with their associated angles). DI (diamond) was converted to BX (box) with the associated angle. All of the ~ components (i.e. ~1 through ~6), which correspond to pairs of "squiggles" or "wings" that modify other components, do not seem to have systematic differences or angles and were combined into a single category without rotation.
MU, an unused code for music notes, was converted to NT, a code for notation that appears in the brand books. ^ was converted to A3, a code used for the same symbol in the brand books.
Several other simplifications were considered (with angles), such as converting + to X, 9 to 6, W to M, etc. For now we chose to keep these components separate.
```{r}
#load in adjustments from components.xlsx
#a relative path is used because knitr evaluates chunks with the working directory set to
#the document's location; rstudioapi::getSourceEditorContext() only works interactively
#inside RStudio and would break non-interactive knitting
collapse <- data.table::as.data.table(readxl::read_excel("components.xlsx", sheet = "collapse"))
rotation <- data.table::as.data.table(readxl::read_excel("components.xlsx", sheet = "rotation"))
#reformat rotation: split comma-separated angle strings (e.g. "1, 3") into character vectors
rotation$rot <- strsplit(rotation$rot, ", ")
```
```{r eval=FALSE}
#apply the `collapse` sheet: rewrite each flagged component to its target code, padded to
#three characters with trailing commas, appending the new rotation digit when one is
#specified ("NA" in the sheet means the replacement is unrotated)
#for each brand
for(i in 1:nrow(brands)){
#check if there are characters that need replacing (either three digit characters like BX1 and QC1 or substringed characters without commas)
temp <- which(brands$brand[[i]][1:4] %in% collapse$from | gsub(",", "", substr(brands$brand[[i]][1:4], 1, 2)) %in% collapse$from)
#and if there are
if(length(temp) > 0){
#go through them
for(j in 1:length(temp)){
#if the full three-digit component is in the data table (so something like BX1, QC1, etc.)
if(brands$brand[[i]][temp[j]] %in% collapse$from){
#store the index of the component
temp_2 <- which(collapse$from == brands$brand[[i]][temp[j]])
#and either replace it without or with rotation
if(collapse$rot[temp_2] == "NA"){
brands$brand[[i]][temp[j]] <- stringr::str_pad(collapse$to[temp_2], width = 3, side = "right", pad = ",")
} else{
#pad the target to two characters, then append the rotation digit as the third character
brands$brand[[i]][temp[j]] <- paste0(stringr::str_pad(collapse$to[temp_2], width = 2, side = "right", pad = ","), collapse$rot[temp_2])
}
} else{ #if it's less than three digits
#store the index of the component (matching on the comma-stripped two-character code)
temp_2 <- which(collapse$from == gsub(",", "", substr(brands$brand[[i]][temp[j]], 1, 2)))
#and either replace it without or with rotation
if(collapse$rot[temp_2] == "NA"){
brands$brand[[i]][temp[j]] <- stringr::str_pad(collapse$to[temp_2], width = 3, side = "right", pad = ",")
} else{
brands$brand[[i]][temp[j]] <- paste0(stringr::str_pad(collapse$to[temp_2], width = 2, side = "right", pad = ","), collapse$rot[temp_2])
}
}
rm(temp_2)
}
}
rm(temp)
}
```
To determine which angles are possible for each symbol we went through and assigned each one a five-digit binary code, where each digit corresponds to whether (0 or 1) each symbol matches itself when flipped horizontally, flipped vertically, rotated 90 degrees, rotated 180 degrees, and rotated 45 degrees. Then, for each five-digit code, we determined which rotation angles were possible. Some extra rotation angles were manually removed for character pairs like + and X, 9 and 6, and W and M, and some rotation angles had to be corrected when the binary code didn't quite capture the symbol, such as for %.
Now let's run through and correct any rotated components where the angle does not match the vector of first unique rotation angles for that component.
```{r eval=FALSE}
#correct rotated components whose angle is not one of the first unique rotation angles
#for that component, using the per-angle correction columns of the `rotation` sheet
#(NA = angle is fine, 0 = strip the rotation, 1-9 = replace with that angle)
#for each brand
for(i in 1:nrow(brands)){
#check if there are rotated components (third character is a digit 1-9)
temp <- which(substr(brands$brand[[i]][1:4], 3, 3) %in% c(1:9))
#if there are
if(length(temp) > 0){
#go through them
for(j in 1:length(temp)){
#store the index of the component in the rotation table
temp_2 <- which(rotation$component == gsub(",", "", substr(brands$brand[[i]][temp[j]], 1, 2)))
#store the current angle of rotation
angle <- substr(brands$brand[[i]][temp[j]], 3, 3)
#if the rotated component matches a real component (checking since we haven't removed misspecified components yet)
if(length(temp_2) > 0){
#and the rotation angle is not in the first unique rotation angles for that component
if(!(angle %in% rotation$rot[[temp_2]])){
#and the new rotation is not zero
#(data.table's ..angle selects the column whose name is stored in the `angle` variable)
if(as.numeric(rotation[temp_2, ..angle]) > 0){
#replace the current rotated component with a corrected version
brands$brand[[i]][temp[j]] <- paste0(substr(brands$brand[[i]][temp[j]], 1, 2), as.numeric(rotation[temp_2, ..angle]))
} else{ #if the new rotation is zero
#remove the rotation entirely
brands$brand[[i]][temp[j]] <- paste0(substr(brands$brand[[i]][temp[j]], 1, 2), ",")
}
}
}
rm(list = c("temp_2", "angle"))
}
}
rm(temp)
}
```
Now we need to store the brand components. These include the components listed in the brand book indices, as well as some additions that appear in the real data (i.e. "TX,"). The "UN," component was excluded as it corresponds to unidentifiable symbols.
```{r}
#store components, right-padded with commas to three characters so they match the
#three-digit codes used in the brand vectors (e.g. "A" becomes "A,,")
components <- stringr::str_pad(rotation$component, width = 3, side = "right", pad = ",")
#print them
components
```
We also need to generate all possible components, accounting for rotation, alongside index components in a data frame so rotated versions can be easily converted to non-rotated versions later on. Since we have already corrected incorrect angles we will only allow for angles that appear in the vector of first unique rotation angles for that component.
```{r eval=FALSE}
#build, for every component, its index code plus all legal rotated variants
#(assembled with lapply + do.call(rbind, ...) rather than growing vectors with c() in a
#loop, which copies the vectors on every iteration)
poss_list <- lapply(seq_along(components), function(x){
  #if the component is not rotatable, it only appears in its unrotated form
  if(length(which(rotation$rot[[x]] == "NA")) > 0){
    data.frame(index = components[x], rot = components[x])
  } else{
    #otherwise include the unrotated form plus one row per allowed rotation angle,
    #mapping every variant back to the same index component
    rotated <- paste0(substr(components[x], 1, 2), rotation$rot[[x]])
    data.frame(index = components[x], rot = c(components[x], rotated))
  }
})
#combine into a single data frame so rotated versions can be converted to non-rotated
#versions later on
all_poss_components <- do.call(rbind, poss_list)
rm(poss_list)
```
Now let's remove any misspecified brands with components that don't appear in `all_poss_components`.
```{r eval=FALSE}
#keep only well-specified brands: all four component slots must be either a known
#(possibly rotated) component or the empty code ",,,"
#logical subsetting is used instead of -which() so that nothing is dropped when no brands
#are misspecified (brands[-integer(0), ] would otherwise remove every row)
well_specified <- sapply(seq_len(nrow(brands)), function(x){
  length(which(brands$brand[[x]][1:4] %in% c(all_poss_components$rot, ",,,"))) == 4
})
brands <- brands[well_specified, ]
rm(well_specified)
```
Before we move any further, we should go ahead and retrieve our geographic data and subset our brands to only include those registered in the state of Kansas.
```{r eval=FALSE}
#get all kansas zip codes
data("zip_code_db", package = "zipcodeR")
zip_code_db <- zip_code_db[which(zip_code_db$state == "KS"), ]
#identify zip codes with missing location data
missing_locations <- which(is.na(zip_code_db$lat))
#save zip codes with missing locations, to manually fill as CSV file outside of R using Google Maps
#sink("location_data/missing_zips.txt")
#cat(zip_code_db$zipcode[missing_locations], sep = "\n")
#sink()
#add in found locations
#NOTE(review): this assumes the rows of missing_zips_found.txt are in exactly the same
#order as `missing_locations` (the order written out by the sink() above) — verify if the
#file is ever regenerated
found <- read.csv("location_data/missing_zips_found.txt", header = FALSE)
zip_code_db$lat[missing_locations] <- found$V2
zip_code_db$lng[missing_locations] <- found$V3
#construct all_zips, a data table with zip codes, counties, latitudes, and longitudes
all_zips <- data.table::data.table(zip = as.numeric(zip_code_db$zipcode), county = as.factor(zip_code_db$county), lat = zip_code_db$lat, lon = zip_code_db$lng)
#save
save(all_zips, file = "location_data/all_zips.RData")
#subset brands to only include those from kansas
brands <- brands[which(brands$location %in% all_zips$zip), ]
#remove temporary objects
rm(list = c("zip_code_db", "missing_locations", "found"))
```
For our agent-based model (ABM) we will also need the pairwise geodesic distances between all of the zip codes in Kansas, so let's go ahead and collect that (and save the object after, because it takes a while).
```{r eval=FALSE}
#calculate pairwise geodesic distances (in kilometers) between zip codes
zip_dists <- geodist::geodist(all_zips[, 3:4], measure = "geodesic")/1000
#label both dimensions of the distance matrix with the corresponding zip codes
dimnames(zip_dists) <- list(all_zips$zip, all_zips$zip)
#save
save(zip_dists, file = "location_data/zip_dists.RData")
```
```{r echo=FALSE}
#reload the precomputed location objects so the eval=FALSE chunks above are not re-run
load("location_data/all_zips.RData")
load("location_data/zip_dists.RData")
```
Now we need to remove brands that are duplicated within the same zip code and year. When this occurs it's usually one family or ranch that has registered a single brand multiple times for different locations on the animal. In our model, duplicated brand codes will correspond to variations of the same combination of symbols in the same order independently of the location on the animal.
```{r eval=FALSE}
#get concatenated brands with duplicate codes
concat_brands <- sapply(seq_len(nrow(brands)), function(x){
  #get the four components and the duplicate code
  temp <- brands$brand[[x]][c(1:4, 6)]
  #if the duplicate code is missing then remove the NA
  if(length(which(is.na(temp))) > 0){
    temp <- temp[-which(is.na(temp))]
  }
  #return a concatenated version
  paste0(temp, collapse = "")
})
#build data table to determine which rows are duplicated (accounting for zip code and year)
concat_brands <- data.table::data.table(brand = concat_brands, location = brands$location, year = brands$year)
#remove duplicated rows from the main data table
#!duplicated() is used instead of -which(duplicated()) so that nothing is dropped when
#there are no duplicates (brands[-integer(0), ] would otherwise remove every row)
brands <- brands[!duplicated(concat_brands), ]
#remove temporary object
rm(concat_brands)
```
Now we need to turn our brand data into a numeric matrix so it is compatible with our ABM. In this matrix, brands will be stored as a vector of eight numbers, where the first four correspond to the components (with zeroes for empty component codes) and the last four correspond to the angles of rotation (with zeroes for unrotated components). For this, we will just replace each component code with a number denoting its position in the `components` vector, and assign a zero to empty positions. We will also append zip codes and years.
Once brands are converted let's go ahead and save the object so we don't have to execute all of this code every time.
```{r eval=FALSE}
#convert brands to a numeric matrix: columns 1-4 are component indices (0 = empty slot),
#columns 5-8 are the corresponding rotation angles (0 = unrotated), column 9 is the zip
#code, and column 10 is the year
#create empty matrix for converted brands
converted_brands <- matrix(0, nrow = nrow(brands), ncol = 10)
#append zip codes and years
converted_brands[, 9] <- as.numeric(brands$location)
converted_brands[, 10] <- as.numeric(brands$year)
#iterate through the brands
for(i in 1:nrow(brands)){
#extract the index numbers of components (ignoring rotation)
brand_nums <- match(all_poss_components$index[match(brands$brand[[i]][1:4], all_poss_components$rot)], components)
#store eventual numeric order of components
comp_order <- order(brand_nums)
#replace empty values with 0
brand_nums[is.na(brand_nums)] <- 0
#create empty vector of angles
angle_nums <- rep(0, 4)
#iterate through the components
#NOTE: since the NAs were just replaced with 0, this loop always runs over all four slots;
#empty slots fail the match below, produce a zero-length diff, and keep their angle of 0
for(j in 1:length(brand_nums[!is.na(brand_nums)])){
#check if there are any characters that differ between the actual component and its index
#form — any difference is the rotation digit
temp <- as.numeric(setdiff(strsplit(all_poss_components$rot[match(brands$brand[[i]][j], all_poss_components$rot)], split = "")[[1]],
strsplit(all_poss_components$index[match(brands$brand[[i]][j], all_poss_components$rot)], split = "")[[1]]))
#if there are, replace the corresponding 0 in the vector of angles
if(length(temp) > 0){angle_nums[j] <- temp}
}
#store numeric brands and angles in the matrix (components sorted in ascending numeric order, and angles re-ordered accordingly)
converted_brands[i, 1:4] <- brand_nums[comp_order]
converted_brands[i, 5:8] <- angle_nums[comp_order]
#remove temporary objects
rm(list = c("brand_nums", "comp_order", "angle_nums", "temp"))
}
#rewrite brands and remove original brands
brands <- converted_brands
rm(converted_brands)
#save
save(brands, file = "converted_brands.RData")
```
```{r echo=FALSE}
#reload the converted numeric brand matrix so the chunk above does not need to be re-run
load("converted_brands.RData")
```
Here's a sample of what this matrix looks like.
```{r}
#print the first ten rows of the converted brand matrix
brands[1:10,]
```
And here is the distribution of brands in terms of numbers of components.
```{r}
#tabulate how many brands have each number of non-empty components (non-zero slots in columns 1-4)
table(sapply(seq_len(nrow(brands)), function(x){sum(brands[x, 1:4] != 0)}))
```
```{r echo = FALSE}
#assign every Kansas county to one of four geographic quadrants (NW/SW/NE/SE), per the
#quadrant definitions in the preregistration document
#create data table of counties
counties <- tigris::list_counties("KS")
counties <- data.table::data.table(county = counties$county, quadrant = rep(NA, nrow(counties)))
#NW counties
counties$quadrant[which(counties$county %in% c("Cheyenne", "Rawlins", "Decatur",
"Norton", "Phillips", "Smith",
"Sherman", "Thomas", "Sheridan",
"Graham", "Rooks", "Osborne",
"Wallace", "Logan", "Gove",
"Trego", "Ellis", "Russell"))] <- "NW"
#SW counties
counties$quadrant[which(counties$county %in% c("Greeley", "Wichita", "Scott",
"Lane", "Ness", "Rush", "Barton",
"Hamilton", "Kearny", "Finney",
"Hodgeman", "Pawnee", "Stafford",
"Stanton", "Grant", "Haskell",
"Gray", "Ford", "Edwards",
"Kiowa", "Pratt", "Morton",
"Stevens", "Seward", "Meade",
"Clark", "Comanche", "Barber"))] <- "SW"
#NE counties
counties$quadrant[which(counties$county %in% c("Jewell", "Republic",
"Washington", "Marshall",
"Nemaha", "Brown", "Doniphan",
"Mitchell", "Cloud", "Clay",
"Riley", "Pottawatomie",
"Jackson", "Atchison",
"Jefferson", "Leavenworth",
"Wyandotte", "Lincoln", "Ottawa",
"Ellsworth", "Saline",
"Dickinson", "Geary", "Morris",
"Wabaunsee", "Shawnee", "Osage",
"Douglas", "Franklin", "Johnson",
"Miami"))] <- "NE"
#SE counties
counties$quadrant[which(counties$county %in% c("Rice", "McPherson", "Marion",
"Chase", "Lyon", "Coffey",
"Anderson", "Linn", "Reno",
"Harvey", "Kingman", "Sedgwick",
"Butler", "Greenwood", "Woodson",
"Allen", "Bourbon", "Elk",
"Wilson", "Neosho", "Crawford",
"Harper", "Sumner", "Cowley",
"Chautauqua", "Montgomery",
"Labette", "Cherokee"))] <- "SE"
#add zip codes to data table (one list of zip codes per county)
data("zip_code_db", package = "zipcodeR")
counties$zip <- lapply(seq_len(nrow(counties)), function(x){zipcodeR::search_county(counties$county[x], state_abb = "KS")$zipcode})
#create a thesaurus of which zip codes are in which quadrants
#(vectorized with rep()/lengths()/unlist() instead of growing vectors with c() in a loop)
quad_thes <- data.frame(quadrant = rep(counties$quadrant, lengths(counties$zip)),
zip = unlist(counties$zip))
#get quadrant for each brand in the dataset (column 9 is the zip code)
brands_quadrants <- quad_thes$quadrant[match(brands[, 9], quad_thes$zip)]
#get number of components per brand (non-zero slots in columns 1-4)
n_components <- sapply(seq_len(nrow(brands)), function(x){4 - length(which(brands[x, 1:4] == 0))})
#get number of two- and three-component brands that are unique to 1990 (old) or unique to the later years (young)
#first, for each unique brand code (discounting location), we need the years in which it appears
concat_brands <- as.factor(sapply(seq_len(nrow(brands)), function(x){paste0(brands[x, 1:8], collapse = " ")}))
unique_concat_brands <- levels(concat_brands)
years_appear <- lapply(unique_concat_brands, function(x){unique(brands[which(concat_brands == x), 10])})
#now, we can get which brands are either only old or only young
which_old <- sapply(seq_along(concat_brands), function(x){all(years_appear[[which(unique_concat_brands == concat_brands[x])]] == 1990)})
which_young <- sapply(seq_along(concat_brands), function(x){all(years_appear[[which(unique_concat_brands == concat_brands[x])]] != 1990)})
#what percent of component types are singletons (i.e. only appear once)
#logical subsetting is used instead of -which() so that nothing is dropped when there are
#no empty (zero) slots
all_components <- c(brands[, 1:4])
all_components <- all_components[all_components != 0]
percent_singleton <- round((length(which(as.numeric(table(all_components)) == 1))/length(as.numeric(table(all_components))))*100, 2)
```
The total number of brands in the dataset, after removing duplicates within zip codes, is `r scales::comma(nrow(brands))`. For brands with two components, `r scales::comma(length(which(n_components == 2 & which_old)))` of them only appear in 1990 ("old") and `r scales::comma(length(which(n_components == 2 & which_young)))` of them only appear in 2008-2016 ("young"). For brands with three components, `r scales::comma(length(which(n_components == 3 & which_old)))` of them are old and `r scales::comma(length(which(n_components == 3 & which_young)))` of them are young. Only `r round((length(which(n_components == 4))/length(n_components))*100, 2)`% of brands have four components, and only `r percent_singleton`% of component types are singletons (i.e. only appear once). Here is a further breakdown of the brands in terms of the four geographic quadrants described in the preregistration document, for each combination of age and complexity:
```{r echo = FALSE, eval = FALSE}
#create function for removing singletons from subset
#remove rows that contain any singleton component from a matrix subset
#a singleton is a component value (including 0) that appears exactly once across columns 1-4
#of the whole subset; the singleton set is computed once up front (the original recomputed
#the full frequency table for every row, which was O(n^2)), and rows are only removed when
#at least one exists — matrix[-integer(0), ] would otherwise drop every row
singleton_removal <- function(matrix){
  #component values that occur exactly once across the whole subset
  singletons <- as.numeric(names(which(table(c(matrix[, 1:4])) == 1)))
  #rows containing at least one singleton
  to_remove <- which(sapply(seq_len(nrow(matrix)), function(x){length(which(matrix[x, 1:4] %in% singletons)) > 0}))
  if(length(to_remove) > 0){
    matrix <- matrix[-to_remove, ]
  }
  return(matrix)
}
#set random seed
set.seed(12345)
#create structured and mixed datasets
#for each combination of age (o = old, y = young), quadrant (NE/NW/SE/SW), and
#complexity (2 or 3 components):
#  - the "structured" subset holds all brands of that age/quadrant/complexity
#  - the matching "_mixed" subset is a size-matched random draw from the same
#    quadrant and complexity, ignoring age
#NOTE(review): the sample() calls consume the RNG stream in a fixed statement order,
#so reordering these lines would change the saved subsets despite the fixed seed
o_ne_2 <- brands[which(which_old & brands_quadrants == "NE" & n_components == 2), ]
o_ne_2_mixed <- brands[sample(which(brands_quadrants == "NE" & n_components == 2), nrow(o_ne_2)), ]
o_ne_3 <- brands[which(which_old & brands_quadrants == "NE" & n_components == 3), ]
o_ne_3_mixed <- brands[sample(which(brands_quadrants == "NE" & n_components == 3), nrow(o_ne_3)), ]
y_ne_2 <- brands[which(which_young & brands_quadrants == "NE" & n_components == 2), ]
y_ne_2_mixed <- brands[sample(which(brands_quadrants == "NE" & n_components == 2), nrow(y_ne_2)), ]
y_ne_3 <- brands[which(which_young & brands_quadrants == "NE" & n_components == 3), ]
y_ne_3_mixed <- brands[sample(which(brands_quadrants == "NE" & n_components == 3), nrow(y_ne_3)), ]
o_nw_2 <- brands[which(which_old & brands_quadrants == "NW" & n_components == 2), ]
o_nw_2_mixed <- brands[sample(which(brands_quadrants == "NW" & n_components == 2), nrow(o_nw_2)), ]
o_nw_3 <- brands[which(which_old & brands_quadrants == "NW" & n_components == 3), ]
o_nw_3_mixed <- brands[sample(which(brands_quadrants == "NW" & n_components == 3), nrow(o_nw_3)), ]
y_nw_2 <- brands[which(which_young & brands_quadrants == "NW" & n_components == 2), ]
y_nw_2_mixed <- brands[sample(which(brands_quadrants == "NW" & n_components == 2), nrow(y_nw_2)), ]
y_nw_3 <- brands[which(which_young & brands_quadrants == "NW" & n_components == 3), ]
y_nw_3_mixed <- brands[sample(which(brands_quadrants == "NW" & n_components == 3), nrow(y_nw_3)), ]
o_se_2 <- brands[which(which_old & brands_quadrants == "SE" & n_components == 2), ]
o_se_2_mixed <- brands[sample(which(brands_quadrants == "SE" & n_components == 2), nrow(o_se_2)), ]
o_se_3 <- brands[which(which_old & brands_quadrants == "SE" & n_components == 3), ]
o_se_3_mixed <- brands[sample(which(brands_quadrants == "SE" & n_components == 3), nrow(o_se_3)), ]
y_se_2 <- brands[which(which_young & brands_quadrants == "SE" & n_components == 2), ]
y_se_2_mixed <- brands[sample(which(brands_quadrants == "SE" & n_components == 2), nrow(y_se_2)), ]
y_se_3 <- brands[which(which_young & brands_quadrants == "SE" & n_components == 3), ]
y_se_3_mixed <- brands[sample(which(brands_quadrants == "SE" & n_components == 3), nrow(y_se_3)), ]
o_sw_2 <- brands[which(which_old & brands_quadrants == "SW" & n_components == 2), ]
o_sw_2_mixed <- brands[sample(which(brands_quadrants == "SW" & n_components == 2), nrow(o_sw_2)), ]
o_sw_3 <- brands[which(which_old & brands_quadrants == "SW" & n_components == 3), ]
o_sw_3_mixed <- brands[sample(which(brands_quadrants == "SW" & n_components == 3), nrow(o_sw_3)), ]
y_sw_2 <- brands[which(which_young & brands_quadrants == "SW" & n_components == 2), ]
y_sw_2_mixed <- brands[sample(which(brands_quadrants == "SW" & n_components == 2), nrow(y_sw_2)), ]
y_sw_3 <- brands[which(which_young & brands_quadrants == "SW" & n_components == 3), ]
y_sw_3_mixed <- brands[sample(which(brands_quadrants == "SW" & n_components == 3), nrow(y_sw_3)), ]
# #remove singletons from all of these (currently disabled; see singleton_removal above)
# o_ne_2 <- singleton_removal(o_ne_2)
# o_ne_2_mixed <- singleton_removal(o_ne_2_mixed)
# o_ne_3 <- singleton_removal(o_ne_3)
# o_ne_3_mixed <- singleton_removal(o_ne_3_mixed)
# y_ne_2 <- singleton_removal(y_ne_2)
# y_ne_2_mixed <- singleton_removal(y_ne_2_mixed)
# y_ne_3 <- singleton_removal(y_ne_3)
# y_ne_3_mixed <- singleton_removal(y_ne_3_mixed)
# o_nw_2 <- singleton_removal(o_nw_2)
# o_nw_2_mixed <- singleton_removal(o_nw_2_mixed)
# o_nw_3 <- singleton_removal(o_nw_3)
# o_nw_3_mixed <- singleton_removal(o_nw_3_mixed)
# y_nw_2 <- singleton_removal(y_nw_2)
# y_nw_2_mixed <- singleton_removal(y_nw_2_mixed)
# y_nw_3 <- singleton_removal(y_nw_3)
# y_nw_3_mixed <- singleton_removal(y_nw_3_mixed)
# o_se_2 <- singleton_removal(o_se_2)
# o_se_2_mixed <- singleton_removal(o_se_2_mixed)
# o_se_3 <- singleton_removal(o_se_3)
# o_se_3_mixed <- singleton_removal(o_se_3_mixed)
# y_se_2 <- singleton_removal(y_se_2)
# y_se_2_mixed <- singleton_removal(y_se_2_mixed)
# y_se_3 <- singleton_removal(y_se_3)
# y_se_3_mixed <- singleton_removal(y_se_3_mixed)
# o_sw_2 <- singleton_removal(o_sw_2)
# o_sw_2_mixed <- singleton_removal(o_sw_2_mixed)
# o_sw_3 <- singleton_removal(o_sw_3)
# o_sw_3_mixed <- singleton_removal(o_sw_3_mixed)
# y_sw_2 <- singleton_removal(y_sw_2)
# y_sw_2_mixed <- singleton_removal(y_sw_2_mixed)
# y_sw_3 <- singleton_removal(y_sw_3)
# y_sw_3_mixed <- singleton_removal(y_sw_3_mixed)
#save structured and mixed datasets
#collect the 32 subset objects into one named list via mget() and write it to disk
subset_names <- c("o_ne_2", "o_ne_2_mixed", "o_ne_3", "o_ne_3_mixed",
                  "y_ne_2", "y_ne_2_mixed", "y_ne_3", "y_ne_3_mixed",
                  "o_nw_2", "o_nw_2_mixed", "o_nw_3", "o_nw_3_mixed",
                  "y_nw_2", "y_nw_2_mixed", "y_nw_3", "y_nw_3_mixed",
                  "o_se_2", "o_se_2_mixed", "o_se_3", "o_se_3_mixed",
                  "y_se_2", "y_se_2_mixed", "y_se_3", "y_se_3_mixed",
                  "o_sw_2", "o_sw_2_mixed", "o_sw_3", "o_sw_3_mixed",
                  "y_sw_2", "y_sw_2_mixed", "y_sw_3", "y_sw_3_mixed")
subsets <- mget(subset_names)
save(subsets, file = "analysis/shuffling_model/subsets.RData")
```
```{r echo = FALSE}
#construct and print table of sample sizes for each combination of age, complexity, and quadrant
#coerce quadrant labels to a fixed-level factor so counts always align with the column
#names, even if a quadrant happens to be absent from one of the subsets (a bare
#as.numeric(table(...)) would silently shift the columns in that case)
quadrant_levels <- c("NE", "NW", "SE", "SW")
quadrant_counts <- function(rows){
  as.numeric(table(factor(brands_quadrants[rows], levels = quadrant_levels)))
}
sample_table <- rbind(quadrant_counts(which(n_components == 2 & which_old)),
                      quadrant_counts(which(n_components == 3 & which_old)),
                      quadrant_counts(which(n_components == 2 & which_young)),
                      quadrant_counts(which(n_components == 3 & which_young)))
sample_table <- as.data.frame(sample_table)
colnames(sample_table) <- quadrant_levels
rownames(sample_table) <- c("2-comp old", "3-comp old", "2-comp young", "3-comp young")
sample_table
```
In short, we have ample sample size to conduct all of the analyses proposed in the preregistration document.
Finally, if a large number of brands are registered in cities, where ranches are unlikely to be located, then it could be indicative of deeper data quality issues. The four [most populous counties in Kansas](http://www.usa.com/rank/kansas-state--population-density--county-rank.htm) are Johnson, Wyandotte, Sedgwick, and Shawnee. Collectively, these counties include the three biggest metropolitan areas: Kansas City, Topeka, and Wichita.
```{r}
#collect zip codes for top four counties
johnson <- zipcodeR::search_county("Johnson", "KS")
wyandotte <- zipcodeR::search_county("Wyandotte", "KS")
sedgwick <- zipcodeR::search_county("Sedgwick", "KS")
shawnee <- zipcodeR::search_county("Shawnee", "KS")
#compile p. o. box and normal zips across the four counties
metro_counties <- list(johnson, wyandotte, sedgwick, shawnee)
#extract the zip codes of a given type from one county lookup
zips_of_type <- function(county, type){
  county$zipcode[which(county$zipcode_type == type)]
}
po_box_zips <- unlist(lapply(metro_counties, zips_of_type, type = "PO Box"))
normal_zips <- unlist(lapply(metro_counties, zips_of_type, type = "Standard"))
#get proportion of brands that are in metropolitan areas, and in p. o. boxes (column 9 holds the zip code)
prop_metro <- length(which(brands[, 9] %in% c(po_box_zips, normal_zips)))/nrow(brands)
prop_po_box <- length(which(brands[, 9] %in% po_box_zips))/nrow(brands)
```
Luckily, it appears that only `r round(prop_metro*100, digits = 2)`% of brands are registered in metropolitan areas, and only `r round(prop_po_box*100, digits = 2)`% are registered to metropolitan P. O. Boxes. This suggests that brand registration in cities is not a significant issue.
## Description of the ABM
The agent-based model simulates ranchers creating new brands every year based on the components (and optionally angles) present in the brands used by ranchers around them, as well as dual constraints of simplicity and complexity. For example, with the parameters in the ABM we can run a model where ranchers copy components at a large geographic scale while being as different as possible from their closest neighbors, and try to create brands that are complex enough that they aren't easily faked but simple enough that they are easily legible.
When a new brand is created, the components and angles around it are sampled within two radii: one in which components and angles are more likely to be used (henceforth copying), and one in which components and angles are less likely to be used (henceforth distinctiveness). All of the components present in each radius are compiled into frequency tables that are used for weighted random sampling. Once components have been sampled, all of the rotations of each component that appear in each radius are compiled as well. The probability that a rancher uses a particular component (or angle) is based on the frequency of it within the copying radius, raised to an exponent $C$, and the inverse frequency of it within the distinctiveness radius, raised to an exponent $D$.
<!-- $$P(x) = F_x^C \times \left(\frac{1}{F_x}\right)^D$$ -->
<center><img src="https://render.githubusercontent.com/render/math?math=P(x)%20=%20F_x^C%20%5Ctimes%20%5Cleft(%5Cfrac{1}{F_x}%5Cright)^D"></center>
$C$ and $D$ control the strength of copying and distinctiveness in the probability of adopting components and angles, where zero is neutrality, one is proportional to their observed frequencies, and values greater than one increase their influence beyond their observed frequencies. When only a subset of angles are possible for a particular component then only the frequencies of those particular angles are considered.
The map below shows an example of copying and distinctiveness radii (200 km and 100 km, respectively) around a target zip code, shown by the green triangle. In this example, components and angles within the larger copying radius (blue) would be more likely to appear in the target zip code, whereas brands within the smaller distinctiveness radius (red) would be less likely to appear. Here we plot the geographic centroids of each zip code rather than their boundaries, as the only shapefiles available at the zip code level are for the US Census' "zip code tabulation areas", which do not include all locations in the dataset.
```{r echo=FALSE, fig.align="center", dpi=300, out.width="75%", fig.width=5.5, fig.height=3}
#get zip codes within 200 km and 100 km of the example target location (zip 67427)
target_zip <- 67427
target_col <- which(colnames(zip_dists) == target_zip)
copy_example <- names(which(zip_dists[, target_col] <= 200))
dist_example <- names(which(zip_dists[, target_col] <= 100))
#state outline for the background
kansas_map <- ggplot2::map_data("state", "kansas")
#one row per zip centroid; color codes: 0 = background, 1 = copying radius,
#2 = distinctiveness radius, 3 = target (which is also the only triangle)
zip_points <- data.frame(zip = all_zips$zip, lon = all_zips$lon, lat = all_zips$lat,
                         color = 0, shape = 0)
zip_points$color[which(zip_points$zip %in% copy_example)] <- 1
zip_points$color[which(zip_points$zip %in% dist_example)] <- 2
zip_points$color[which(zip_points$zip == target_zip)] <- 3
zip_points$shape[which(zip_points$zip == target_zip)] <- 1
zip_points$color <- as.factor(zip_points$color)
zip_points$shape <- as.factor(zip_points$shape)
#points first, then the state polygon outline on top
ggplot2::ggplot(kansas_map, ggplot2::aes(x = long, y = lat)) +
  ggplot2::geom_point(zip_points, mapping = ggplot2::aes(x = lon, y = lat, colour = color, shape = shape, size = shape)) +
  ggplot2::geom_polygon(fill = "transparent", colour = "black") +
  ggplot2::scale_colour_manual(values = c("grey90", "#0072B2", "#D55E00", "#009E73")) +
  ggplot2::scale_shape_manual(values = c(16, 17)) +
  ggplot2::scale_size_manual(values = c(2, 8)) +
  ggplot2::coord_quickmap() +
  ggplot2::theme_minimal() +
  ggplot2::labs(x = "Longitude", y = "Latitude") +
  ggplot2::theme(legend.position = "none")
#remove temporary objects
rm(list = c("copy_example", "dist_example", "kansas_map", "zip_points", "target_zip", "target_col"))
```
To simulate simplicity and complexity, the number of components in each new brand is drawn from a Poisson distribution where $\lambda$ is the parameter of interest (henceforth complexity). Below is an example of the normalized probabilities of creating a brand with between one and four components when $\lambda$ is 0.5, 3, and 15. Note that the normalization here is only for plotting purposes - we will use the raw output of `dpois` as the `prob` argument of base R's `sample` function.
```{r echo = FALSE, fig.align="center", dpi=300, out.width="75%", fig.width=5, fig.height=3}
#create data frame for poisson example: range-normalized dpois probabilities of
#1-4 components at three values of lambda
lambdas <- c(0.5, 3, 15)
comp_range <- c(1, 2, 3, 4)
pois_example <- data.frame(n_comps = rep(comp_range, times = length(lambdas)),
                           probs = unlist(lapply(lambdas, function(l){BBmisc::normalize(dpois(comp_range, l), "range")})),
                           lambda = rep(lambdas, each = length(comp_range)))
#plot poisson example (colour in the top-level aes implies the grouping, so no explicit group is needed)
ggplot2::ggplot(data = pois_example, ggplot2::aes(x = n_comps, y = probs, color = as.factor(lambda))) +
  ggplot2::geom_line() +
  ggplot2::geom_point() +
  ggplot2::labs(x = "Number of Components", y = "Normalized Probabilities", color = "\u03bb") +
  ggplot2::scale_color_manual(values = c("#0072B2", "#D55E00", "#009E73")) +
  ggplot2::theme_linedraw()
#remove temporary objects
rm(list = c("pois_example", "lambdas", "comp_range"))
```
At the beginning of each year in the ABM a set of `n_new` brands is created, where `n_new` is the average number of new brands that appear each year in the observed data. Each new brand is assigned a zip code, where the probability of each zip code is proportional to its frequency in all years of the observed data. After new brands are created a set of `n_old` random brands is removed, where `n_old` is the average number of brands that disappear each year in the observed data.
The ABM is initialized with the earliest year of observed data and runs through every year until the final year of observed data. In order to fit the parameters of the ABM to the observed data we also need to calculate a set of summary statistics that capture both the overall diversity and spatial diversity of components and brands. Right now the ABM collects the following summary statistics at the end of each year:
* For components:
- Overall diversity:
1. Proportion of components that are the most common type
2. Proportion of components that are the most rare type
3. Shannon's diversity index
4. Simpson's diversity index
- Spatial diversity:
1. Jaccard index of beta diversity (zip codes)
2. Morisita-Horn index of beta diversity (zip codes)
3. Jaccard index of beta diversity (counties)
4. Morisita-Horn index of beta diversity (counties)
* For brands:
- Overall diversity:
1. Mean Levenshtein distance (from random 10%)
For all diversity metrics we calculate their Hill number counterparts, because they are [measured on the same scale](https://onlinelibrary.wiley.com/doi/10.1111/oik.07202) and [better account for relative abundance](https://esajournals.onlinelibrary.wiley.com/doi/abs/10.1890/13-0133.1). Shannon's diversity index emphasizes more rare types whereas Simpson's diversity index emphasizes more common types. The Jaccard and Morisita-Horn indices were similarly chosen for their complementarity. The Morisita-Horn index is a commonly used [abundance-based](https://onlinelibrary.wiley.com/doi/10.1111/j.1541-0420.2005.00489.x) beta diversity index, whereas the Jaccard index is the most robust of the [incidence-based](https://esajournals.onlinelibrary.wiley.com/doi/10.1002/ecs2.2100) beta diversity indices to sampling error. We calculate beta diversity at both the zip code and county-level to assess spatial diversity at two different resolutions. Diversity indices are not calculated at the level of brands, since duplicated brands in the model are not actually the same type (i.e. duplicates in the model and data correspond to slight variations of the same set of components that are only coded as the same). The mean Levenshtein distance (a.k.a. edit distance), or the [minimum number of insertions, deletions, and substitutions required to convert one sequence to another](https://stat.ethz.ch/R-manual/R-devel/library/utils/html/adist.html), is calculated from a random 10% of brands (specified by the `edit_dist_prop` argument of the ABM function).
The ABM parameters will be fit to the observed data using the random forest version of approximate Bayesian computation (ABC). Random forest ABC is [robust to the number of summary statistics](https://academic.oup.com/bioinformatics/article/32/6/859/1744513) and even [ranks them according to their importance](https://academic.oup.com/bioinformatics/article/35/10/1720/5132692), which means that we will not have to [reduce the dimensionality](https://projecteuclid.org/journals/statistical-science/volume-28/issue-2/A-Comparative-Review-of-Dimension-Reduction-Methods-in-Approximate-Bayesian/10.1214/12-STS406.full) of our summary statistics prior to inference.
## Test of the ABM
The ABM functions are in the `cattlebrandABM.R` file. Please refer to the (heavily commented) functions in this file for details.
```{r}
#load the ABM functions (cattlebrandABM and its helpers) into the current session
source("cattlebrandABM.R")
```
Before testing the ABM we need to calculate a couple of things - first the probability of rotation, which for now is just the proportion of brands in the full dataset that are rotated.
```{r}
#probability of rotation (proportion of rotated brands in the full dataset)
#NOTE(review): assumes the first cell of the table is the unrotated code — confirm
#that the lowest value in columns 5:8 marks "no rotation"
rotation_table <- table(brands[, 5:8])
rot_prob <- as.numeric(1 - (rotation_table[1]/sum(rotation_table)))
```
Now we can subset the brand data to (1) calculate the average number of new and old brands per year, and (2) have the first year of data separated to initialize the model with.
```{r}
#separate brands data by year (column 10 holds the year; columns 1:9 the brand + zip)
brands_by_year <- function(yr){
  data.table::data.table(brands[which(brands[, 10] == yr), 1:9])
}
brands_1990 <- brands_by_year(1990)
brands_2008 <- brands_by_year(2008)
brands_2014 <- brands_by_year(2014)
brands_2015 <- brands_by_year(2015)
brands_2016 <- brands_by_year(2016)
#calculate average number of new brands that appear each year and old brands that
#disappear each year (fsetdiff gets rows of first that are not in second);
#the 1990-2008 and 2008-2014 gaps are annualized by dividing by 18 and 6
annualized_turnover <- function(first, second, gap){
  nrow(data.table::fsetdiff(first, second))/gap
}
n_new <- mean(c(annualized_turnover(brands_2008, brands_1990, 18),
                annualized_turnover(brands_2014, brands_2008, 6),
                annualized_turnover(brands_2015, brands_2014, 1),
                annualized_turnover(brands_2016, brands_2015, 1)))
n_old <- mean(c(annualized_turnover(brands_1990, brands_2008, 18),
                annualized_turnover(brands_2008, brands_2014, 6),
                annualized_turnover(brands_2014, brands_2015, 1),
                annualized_turnover(brands_2015, brands_2016, 1)))
```
We also need to reshape components so it includes whether or not a component is rotatable (or includes at least one comma).
```{r}
#reshape components so each row stores the component code and whether it is rotatable
components <- data.table::data.table(components = components, rotatable = rotation$rot)
#the raw rotatable column encodes missingness as the literal string "NA"; convert to real NA
components$rotatable[which(components$rotatable == "NA")] <- NA
#coerce each entry to numeric; vapply replaces the sapply(1:nrow(...)) index loop with a
#direct, type-stable elementwise map (as.numeric(NA) yields NA_real_, preserving missingness)
components$rotatable <- vapply(components$rotatable, function(v){as.numeric(v)}, numeric(1))
#save
save(components, file = "components.RData")
```
Eventually we can use the code below to set the limits of our prior for the sizes of the two radii.
```{r}
#get min and max distances between zip codes (to inform prior range)
#zero distances are self-distances, so exclude them before taking the minimum
positive_dists <- zip_dists[which(zip_dists != 0)]
round(min(positive_dists))
round(max(zip_dists))
```
Okay, now we are ready to do a test run of the ABM. We'll set the complexity ($\lambda$) to 3, the copying radius to 200 km, the distinctive radius to 100 km, and the strength of both copying and distinctiveness to 1. The model will be initialized with the data from 1990, and summary statistics will be collected for 2008, 2014, 2015, and 2016. We'll also run one model that ignores the angles of components, and one that takes it into account.
```{r}
#test out the components-only ABM (and get runtime)
start <- Sys.time()
components_only <- cattlebrandABM(init_brands = as.matrix(brands_1990), components, all_zips, zip_dists,
                                  init_year = 1990, sampling_years = c(2008, 2014, 2015, 2016), n_new, n_old,
                                  rot_prob, complexity = 3, copy_radius = 200, copy_strength = 1,
                                  dist_radius = 100, dist_strength = 1, angles = FALSE, edit_dist_prop = 0.1)
Sys.time() - start
#print output
components_only
#test out the components and angles ABM (and get runtime)
start <- Sys.time()
#fixed: initialize from the 1990 data to match init_year = 1990 and the components-only
#run above (this call previously passed brands_2008 despite the prose stating both
#models are initialized with the 1990 data)
components_angles <- cattlebrandABM(init_brands = as.matrix(brands_1990), components, all_zips, zip_dists,
                                    init_year = 1990, sampling_years = c(2008, 2014, 2015, 2016), n_new, n_old,
                                    rot_prob, complexity = 3, copy_radius = 200, copy_strength = 1,
                                    dist_radius = 100, dist_strength = 1, angles = TRUE, edit_dist_prop = 0.1)
Sys.time() - start
#print output
components_angles
```
The output of each model is a matrix with a row for each of the four sampling years, and a column for each of the nine summary statistics collected in that year. After some profiling and optimization (using `profvis`) the runtime for the agent-based model is just barely fast enough for generative inference. I tried to further optimize the way that unique brands and component/angle combinations are handled by leaving them as matrices of integers instead of converting them into concatenated strings, but surprisingly the string method is significantly faster.
Now let's ensure that variation in our parameter values actually leads to variation in our summary statistics. To do this, we'll run 1,000 simulations of the model assuming the same radii as above but with varying parameter values for `complexity`, `copy_strength`, and `dist_strength`. By plotting each dynamic parameter value against the summary statistics that they generate, we can ensure that variation in one corresponds to variation in the other.
```{r eval=FALSE}
#set number of simulations and cores
n_sims <- 1000
n_cores <- parallel::detectCores()-1
#get uniform priors for three main dynamic parameters
priors <- data.frame(complexity = runif(n_sims, min = 0.5, max = 15),
                     copy_strength = runif(n_sims, min = 0, max = 5),
                     dist_strength = runif(n_sims, min = 0, max = 5))
#one simulation with the i-th prior draw; radii fixed at 200/100 km as in the test runs
#NOTE(review): init_brands is brands_2008 while init_year = 1990 — confirm this is intentional
run_one <- function(i){
  cattlebrandABM(init_brands = as.matrix(brands_2008), components, all_zips, zip_dists,
                 init_year = 1990, sampling_years = c(2016), n_new, n_old,
                 rot_prob, complexity = priors$complexity[i], copy_radius = 200,
                 copy_strength = priors$copy_strength[i], dist_radius = 100,
                 dist_strength = priors$dist_strength[i], angles = FALSE, edit_dist_prop = 0.1)
}
#run simulations in parallel
sum_stats <- parallel::mclapply(seq_len(n_sims), run_one, mc.cores = n_cores)
#simplify and save output
sum_stats <- do.call("rbind", sum_stats)
sim_test <- cbind(priors, sum_stats)
save(sim_test, file = "sim_test.RData")
```
In this plot, each row corresponds to the nine summary statistics and each column corresponds to the three dynamic parameters in the ABM (A: complexity, B: copying strength, C: distinctiveness strength).
```{r echo=FALSE, fig.align="center", fig.height=10, fig.width=3.5}
load("sim_test.RData")
#one scatter panel of parameter column param_idx against summary statistic stat_idx
#(summary statistics start at column 4 of sim_test); only panels in the first parameter
#column show a y-axis title (the statistic's number)
panel <- function(param_idx, stat_idx, show_y){
  p <- ggplot2::ggplot(sim_test, ggplot2::aes(sim_test[, param_idx], sim_test[, stat_idx + 3])) +
    ggplot2::geom_point(size = 0.2) +
    ggplot2::theme_void()
  if (show_y) {
    p <- p + ggplot2::theme(axis.title.y = ggplot2::element_text()) +
      ggplot2::labs(x = NULL, y = stat_idx)
  } else {
    p <- p + ggplot2::theme(axis.title.x = ggplot2::element_blank(),
                            axis.title.y = ggplot2::element_blank())
  }
  p
}
#nine statistics (rows) for each of the three parameters (columns)
plots <- c(lapply(1:9, function(i){panel(1, i, show_y = TRUE)}),
           lapply(1:9, function(i){panel(2, i, show_y = FALSE)}),
           lapply(1:9, function(i){panel(3, i, show_y = FALSE)}))
#label the bottom panel of each column (A: complexity, B: copying strength, C: distinctiveness strength)
bottom_labels <- c("A", "B", "C")
for (j in seq_along(bottom_labels)) {
  plots[[j * 9]] <- plots[[j * 9]] + ggplot2::theme(axis.title.x = ggplot2::element_text()) +
    ggplot2::labs(x = bottom_labels[j])
}
cowplot::plot_grid(plotlist = plots, nrow = 9, byrow = FALSE, label_x = c(1, 2, 3))
```