# Synthetic Population dataset
syn_ind <- read.csv("data/01_synthetic_population_dataset/00_0831_final_synthesized_individual.csv")
syn_hh <- read.csv("data/01_synthetic_population_dataset/00_0831_final_synthesized_household.csv")
CB2000CBP All Sectors: County Business Patterns: https://data.census.gov/cedsci/table?q=business%20establishment&g=0500000US36013&tid=CBP2020.CB2000CBP
B08007 SEX OF WORKERS BY PLACE OF WORK–STATE AND COUNTY LEVEL https://data.census.gov/cedsci/table?q=employment&g=0500000US36013%241500000&tid=ACSDT5Y2020.B08007
S2301 EMPLOYMENT STATUS (Chautauqua county) https://data.census.gov/cedsci/table?q=S2301&g=0500000US36013&tid=ACSST5Y2020.S2301
S1401 SCHOOL ENROLLMENT https://data.census.gov/cedsci/table?t=School%20Enrollment&g=0500000US36013&tid=ACSST5Y2020.S1401
Educational Institution Dataset: https://edg.epa.gov/metadata/catalog/main/home.page
NY GIS Clearinghouse (Public Schools K-12): http://gis.ny.gov/gisdata/inventories/details.cfm?DSID=1326
# ----- Size of business of all sections in Chau
css_business_size <- read.csv("data/ACSDT5Y2020_chau_block_group_cleaned/Chautauqua_county_business_patterns.csv")
# ----- Employment status at bg
ttt <- read.csv("data/ACSDT5Y2020_chau_block_group_cleaned/Chautauqua_bg_ACSDT5Y2020.B08007-2022-08-31T160100.csv")
ttt <- cln_census_geoid_2(ttt, idx_cbgroup)
ttt$n_wok_out_county_state <- ttt$n_wok_out_county + ttt$n_wok_out_state
ttt$p_wok_in_county <- round(ttt$n_wok_in_county / ttt$ttl_wokers,4)
ttt$p_wok_out_county_state <- 1 - ttt$p_wok_in_county
css_emp_place <- ttt
# ----- Employment_ratio county
ttt <- read.csv("data/ACSDT5Y2020_chau_block_group_cleaned/Chautauqua_county_ACSST5Y2020.S2301-2022-08-31T180418.csv")
css_emp_ratio <- ttt
# ----- Public K-12 in NYS ---------
tt <- st_read("data/NY_Public_K_12/Public_K_12.shp")
tt <- tt[lengths(st_intersects(tt, c_county)) >0, ]
c_pub_k12 <- tt
# ----- School district ---------
tt <- st_read("data/NY_SchDist/SchDist_2019_v3.shp")
tt <- tt[lengths(st_intersects(tt, c_county))>0, ]
c_school_dist <- tt
# ----- School enrollment ---------
tt <- read.csv("data/ACSDT5Y2020_chau_block_group_cleaned/Chautauqua_county_ACSST5Y2020.S1401-2022-09-06T221723.csv")
css_age_school_enroll <- tt
## This cell can validate the rural/urban data collected from the Census
tt <- css_age_gender[c("GEOID","ttl")]
tt <- left_join(c_bgroup["GEOID"], tt)
tt$area <- st_area(tt)
tt$density <- tt$ttl * 1000**2/ as.numeric(tt$area)
tt$cat <- cut(tt$density, breaks = c(0,100,10000),
labels = c("rural","suburban"))
tm_shape(tt) + tm_polygons(col = "cat",alpha = 0.5) +
tm_shape(c_urban) + tm_polygons(alpha = 0.5, col="red")
# Unique id of household with (n_member > 1)
lt_hhid <- syn_hh %>% filter(hh_size > 1) %>% pull(hh_id) %>% unique()
# Network data container
df_network <- data.frame(matrix(ncol = 4)) %>% setnames(c("Source","Target","Type","Relation"))
foo <- syn_ind # individual
tt <- syn_hh # household
nn <- df_network # network
for (i in 1:length(lt_hhid)) {
# unique household id
hh_idx <- lt_hhid[[i]]
# All individuals in a particular household
data <- foo %>% filter(hh_id == hh_idx)
a <- combn(data$ind_id,2)
data <- data.frame(Source = a[1,],
Target = a[2,],
Type = "Undirected",
Relation = "Family")
nn <- rbind(nn, data)
}
# # Save network result
# nn <- na.omit(nn)
# write.csv(nn,"data/02_network_dataset/XX_step1_family_network.csv", row.names = F)
# (Projected CRS: NAD83 / UTM zone 18N)
# datasets: c_county, c_bgroup (id:GEOID), c_parcel_r (id: PRINT_KEY)
# Join residential parcel with census block group
df <- c_parcel_r %>% st_centroid()
df <- st_join(df, c_bgroup, join = st_intersects)
df <- df[!duplicated(df$PRINT_KEY),]
st_geometry(df) <- NULL
# Prepare a df of residential parcel
tt <- df[c("PRINT_KEY","GEOID","PROP_CLASS")]
# remove recreational use
tt <- tt[!tt$PROP_CLASS %in% c("200","242","260"), ]
# two_family * 2
a <- tt[tt$PROP_CLASS=="220",]
tt <- rbind(tt,a)
# three family * 3
a <- tt[tt$PROP_CLASS=="230",]
a <- a[rep(seq_len(nrow(a)), each = 2),]
tt <- rbind(tt, a)
# rural residence
a <- tt[tt$PROP_CLASS %in% c ("240","241"),] # primary residential
a <- a[rep(seq_len(nrow(a)), each = 2),]
tt <- rbind(tt, a)
# estate and residence
a <- tt[tt$PROP_CLASS %in% c("250","280","281"),]
a <- a[rep(seq_len(nrow(a)), each = 29),]
tt <- rbind(tt, a)
rownames(tt) <- NULL
tt$prcl_idx <- sprintf("R_prcl_%06d",1:nrow(tt))
df_residence_list <- tt
# set random seed
set.seed(rm_seed)
# datasets
foo <- syn_ind # individual
tt <- syn_hh # household
zz <- df_residence_list # residential parcel list
# Initialize
tt$prcl_idx <- NA
zz$if_pop <- NA
for (i in 1:length(lt_geoid_bg)) {
idx <- lt_geoid_bg[[i]]
# GEOID: Individual Data
data <- foo %>% filter(GEOID == idx)
# GEOID: Household Data
df <- tt %>% filter(GEOID == idx)
# GEOID: residential parcel data
zzz <- zz %>% filter(GEOID == idx)
# Threshold
n <- min(nrow(zzz), # available residence
nrow(df)) # all household
for (j in 1:n) {
# randomly select a household : id
a <- df %>% filter(is.na(prcl_idx)) %>% slice_sample(n=1) %>% pull(hh_id)
# randomly select a residential parcel : id
b <- zzz %>% filter(is.na(if_pop)) %>% slice_sample(n=1) %>% pull(prcl_idx)
df[df$hh_id == a,]$prcl_idx <- b
zzz[zzz$prcl_idx == b, ]$if_pop <- a
}
# Save result
tt <- tt %>% left_join(df[c("hh_id","prcl_idx")], by = "hh_id") %>%
mutate(prcl_idx = coalesce(prcl_idx.x, prcl_idx.y)) %>%
select(-prcl_idx.x, -prcl_idx.y)
zz <- zz %>% left_join(zzz[c("prcl_idx","if_pop")], by="prcl_idx") %>%
mutate(if_pop = coalesce (if_pop.x, if_pop.y)) %>%
select(-if_pop.x, -if_pop.y)
print(i)
}
# populate rest households
# extract "household without parcel" and "parcel without household"
lt_temp <- tt[is.na(tt$prcl_idx),]
zzz <- zz[is.na(zzz$if_pop),]
for (i in 1:nrow(lt_temp)) {
a <- lt_temp %>% filter(is.na(prcl_idx)) %>% slice_sample(n=1) %>% pull(hh_id)
b <- zzz %>% filter(is.na(if_pop)) %>% slice_sample(n=1) %>% pull(prcl_idx)
lt_temp[lt_temp$hh_id == a, ]$prcl_idx <- b
zzz[zzz$prcl_idx == b,]$if_pop <- a
}
# Fill original dataset
tt <- tt %>% left_join(lt_temp[c("hh_id","prcl_idx")], by = "hh_id") %>%
mutate(prcl_idx = coalesce(prcl_idx.x, prcl_idx.y)) %>%
select(-prcl_idx.x, -prcl_idx.y)
zz <- zz %>% left_join(zzz[c("prcl_idx","if_pop")], by="prcl_idx") %>%
mutate(if_pop = coalesce (if_pop.x, if_pop.y)) %>%
select(-if_pop.x, -if_pop.y)
zz <- zz[!is.na(zz$if_pop),] # only keep occupied parcel
tt <- left_join(tt, zz[c("prcl_idx","PRINT_KEY","PROP_CLASS")])
# ----- Urban Residential Parcel ---------
lt_urban_parcel_r <- c_parcel_r[lengths(st_intersects(c_parcel_r, c_urban))>0,] %>% pull("PRINT_KEY")
tt$urban_rural <- "rural"
tt[tt$PRINT_KEY %in% lt_urban_parcel_r,]$urban_rural <- "urban"
# EXPORT
# write.csv(tt, "data/01_synthetic_population_dataset/XX_1014_household_parcel_idx_urban_rural.csv", row.names = F)
# syn_hh_prcl <- tt
# # ----- Plot ---------
# aa <- left_join(tt, c_parcel_r["PRINT_KEY"]) %>% st_as_sf()
# tmap_mode("plot")
# pdf("plot/xx_Parcel_residential_urban_rural.pdf")
# tm_shape(aa) + tm_polygons(col = "urban_rural", border.alpha = 0)
# dev.off()
# Prepare parcels for gq
df <- c_parcel_gq
df <- st_join(df, c_bgroup, join = st_intersects)
df <- df[!duplicated(df$PRINT_KEY),]
# urban/rural group quarter
df$urban_rural <- "rural"
df[lengths(st_intersects(df, c_urban))>0,]$urban_rural <- "urban"
st_geometry(df) <- NULL
df <- df[c("PRINT_KEY","GEOID","PROP_CLASS","urban_rural")]
df$prcl_idx <- sprintf("R_gq_%03d",1:nrow(df))
df_res_gq_list <- df
# Identify individuals in gq
foo <- syn_ind # individual
foo <- foo[foo$hh_role=="in_gq",]
foo$hh_id_2 <- paste(foo$hh_id, foo$GEOID, sep="_")
lt_gq <- foo[c("GEOID","hh_id","hh_id_2")] %>% mutate(act_size = 1)
lt_gq <- aggregate(act_size~hh_id + GEOID + hh_id_2, data = lt_gq, FUN =sum)
# different gq types
df <- df_res_gq_list
lt_temp <- lt_gq$hh_id %>% unique()
df[lt_temp] <- 0
df[df$PROP_CLASS %in% c("613","615"),]$gq_college_housing <- 1
df[df$PROP_CLASS %in% c("633"),c("gq_others_adult_over64","gq_others_adult")] <- 1
df[df$PROP_CLASS %in% c("641","642"),c("gq_others_adult_over64")] <- 1
df[df$PROP_CLASS %in% c("670"),c("gq_Juvenile_facility","gq_others_adult")] <- 1
df$filled <- 0
# Populate
row.names(lt_gq) <- NULL
lt_gq[c("PRINT_KEY","prcl_idx")] <- NA
for (i in 1:nrow(lt_gq)) {
idx <- lt_gq[i,]$GEOID
a <- lt_gq[i,]$hh_id
b <- df %>% filter(GEOID == idx) %>% filter(get(a) == 1)
if(nrow(b)>0){
b <- b %>% slice_sample(n=1)
df[df$PRINT_KEY == b$PRINT_KEY,]$filled <- 1
lt_gq[i, c("PRINT_KEY","prcl_idx")] <- b[c("PRINT_KEY","prcl_idx")]
}
}
zz <- lt_gq[is.na(lt_gq$prcl_idx),]
zzz <- lt_gq[!is.na(lt_gq$prcl_idx),]
for (i in 1:nrow(zz)) {
a <- zz[i,]$hh_id
b <- df %>% filter(get(a) == 1 & filled==0)
if(nrow(b) == 0){
b <- df %>% filter(get(a) == 1)
}
if(nrow(b) > 0){
b <- b %>% slice_sample(n=1)
df[df$PRINT_KEY == b$PRINT_KEY,]$filled <- 1
zz[i, c("PRINT_KEY","prcl_idx")] <- b[c("PRINT_KEY","prcl_idx")]
}
}
lt_gq <- rbind(zz,zzz)
lt_gq$type <- "in_gq"
lt_gq <- left_join(lt_gq, df_res_gq_list[c("prcl_idx","urban_rural","PROP_CLASS")])
setnames(lt_gq, old="act_size", new="hh_size")
rownames(lt_gq) <- NULL
# # export
# syn_hh_gq_prcl <- rbind(syn_hh_prcl, lt_gq[c("GEOID","hh_id","type","hh_size","prcl_idx","PRINT_KEY","PROP_CLASS","urban_rural")])
# write.csv(syn_hh_gq_prcl, "data/01_synthetic_population_dataset/XX_1014_household_gq_parcel_idx_urban_rural.csv", row.names = F)
# Individual - urban/rural
tt <- left_join(syn_ind, syn_hh_gq_prcl[c("GEOID","hh_id","urban_rural")], by=c("GEOID"="GEOID","hh_id"="hh_id"))
# syn_ind_urb <- tt
# write.csv(syn_ind_urb, "data/01_synthetic_population_dataset/XX_1014_individual_urban_rural.csv", row.names = F)
df <- data.frame(matrix(ncol = 5)) %>% setnames(new = c("business_id",
"Meaning_of_Employment_size_of_establishments_code",
"employment_size_min","employment_size_max","act_size"))
tt <- css_business_size[2:10, 4:7]
rownames(tt) <- NULL
for (i in 1:nrow(tt)) {
a <- tt[i, 1:3]
b <- tt[i,4]
data <- a[rep(seq_len(nrow(a)), each = b),]
data$business_id <- NA
data$act_size <- 0
df <- rbind(df, data)
}
df <- df[!is.na(df$Meaning_of_Employment_size_of_establishments_code),]
rownames(df) <- NULL
df$business_id <- sprintf("Org_%05d", 1:nrow(df))
df_business <- df
1: Who is employed based on employment-population ratio 2. Who is working in county
Labor Force Participation Rate includes the numbers of people with a job as well as the number actively looking for work. We used employment-population ratio
# set random seed
set.seed(rm_seed)
# datasets
foo <- syn_ind_urb # individual
# initialize
foo$if_employed <- 0
foo$wok_place <- 0
# randomly select employeed individuals at COUNTY level
for (i in 1:nrow(css_emp_ratio)) {
a <- css_emp_ratio[i,]
data <- foo %>% filter(age >= a$age_lo & age <= a$age_hi)
# calculate # of employeed
m <- round(nrow(data) * a$Employment_Population_Ratio,0)
b <- data %>% slice_sample(n = m) %>% pull(ind_id) # random selection
foo[foo$ind_id %in% b,]$if_employed <- 1
}
# decide their working place (in/out county) at block groups level
for (i in 1:length(lt_geoid_bg)) {
idx <- lt_geoid_bg[[i]]
m <- css_emp_place %>% filter(GEOID == idx)
data <- foo %>% filter(GEOID == idx & if_employed == 1)
# "in_county", "out_county_or_state"
n <- round(nrow(data) * m$p_wok_in_county, 0)
a <- data %>% slice_sample(n=n) %>% pull(ind_id)
foo[foo$ind_id %in% data$ind_id, ]$wok_place <- "out_county_or_state"
foo[foo$ind_id %in% a,]$wok_place <- "in_county"
# CHECK
# print(paste(idx, (m$ttl_wokers - nrow(data)) / m$ttl_wokers, sep = "-----------------------"))
}
# # EXPORT
# write.csv(foo, "data/01_synthetic_population_dataset/XX_1014_individual_urban_rural_emp_place.csv", row.names = F)
# syn_ind_urb_emp <- foo
# set random seed
set.seed(rm_seed)
# datasets
foo <- syn_ind_urb_emp # individual
zz <- data.frame(matrix(ncol = 5, nrow = 0)) %>% setnames(new = names(df_business))
# initialize & extract only individuals working "in_county"
foo$business_id <- NA
data <- foo %>% filter(wok_place == "in_county")
# create a list of positions
for (i in 1:nrow(df_business)) {
a <- df_business[i,]
a <- a[rep(seq_len(nrow(a)), each = a$employment_size_max),]
zz <- rbind(zz, a)
}
rownames(zz) <- NULL
zz$position_idx <- sprintf("P_%05d",1:nrow(zz))
# connect employees with positions
for (i in 1:nrow(data)) {
# Randomly select an in-county worker & a company
a <- data %>% filter(is.na(business_id)) %>% slice_sample(n=1) %>% pull(ind_id)
b <- zz %>% filter(act_size < 1) %>% slice_sample(n=1)
# assign value back
data[data$ind_id == a, ]$business_id <- b$business_id
zz[zz$position_idx == b$position_idx,]$act_size <- 1
}
zzz <- zz[zz$act_size ==1,]
zzz <- table(zzz$business_id) %>% as.data.frame()
zzz <- left_join(df_business, zzz, by=c("business_id"="Var1")) %>% mutate(act_size = Freq) %>% select(-Freq)
foo <- left_join(syn_ind_urb_emp, data[c("ind_id","business_id")])
# # EXPORT
# write.csv(foo,"data/01_synthetic_population_dataset/XX_1014_individual_urban_rural_emp_place_orgID.csv", row.names = F)
# syn_ind_urb_empID <- foo
#
# df_business_pop <- zzz[!is.na(zzz$act_size),]
# write.csv(df_business_pop,"data/01_synthetic_population_dataset/XX_1014_business_in_county_act_size.csv", row.names = F)
# Unique School district code: c_school_dist["SDLCODE"]; c_pub_k12["SDL_CODE"]
# Unique School id: c_pub_k12["SDL_CODE"]
# tm_shape(c_school_dist["SDLCODE"]) + tm_polygons(alpha = 0, border.col = "red") +
# tm_shape(c_pub_k12["SDL_CODE"]) + tm_dots(col="blue")
data <- c_pub_k12[,c(3,41:59)] # unique id: SED_CODE
st_geometry(data) <- NULL
# Create age range
lt <- sprintf("age_%02d", 3:19)
data[,lt] <- NA
data[is.na(data)] <- 0
# Decide age range for each grade
data[data$GRADE_PK == "Y", c("age_03","age_04")] <- "Y"
data[data$GRADE_FK == "Y", c("age_04","age_05","age_06")] <- "Y"
data[data$GRADE_1 == "Y", c("age_06","age_07")] <- "Y"
data[data$GRADE_2 == "Y", c("age_07","age_08")] <- "Y"
data[data$GRADE_3 == "Y", c("age_08","age_09")] <- "Y"
data[data$GRADE_4 == "Y", c("age_09","age_10")] <- "Y"
data[data$GRADE_5 == "Y", c("age_10","age_11")] <- "Y"
data[data$GRADE_6 == "Y", c("age_11","age_12")] <- "Y"
data[data$GRADE_7 == "Y", c("age_12","age_13")] <- "Y"
data[data$GRADE_8 == "Y", c("age_13","age_14")] <- "Y"
data[data$GRADE_9 == "Y", c("age_14","age_15")] <- "Y"
data[data$GRADE_10 == "Y", c("age_15","age_16")] <- "Y"
data[data$GRADE_11 == "Y", c("age_16","age_17")] <- "Y"
data[data$GRADE_12 == "Y", c("age_17","age_18","age_19")] <- "Y"
data <- left_join(c_pub_k12[,c("SED_CODE","SDL_CODE")], data[,c(1,21:37)])
c_pub_k12_cln <- data
c_pub_k12_cln <- st_as_sf(c_pub_k12_cln)
# Identify Child age between 2-19 who enrolls in school
# css_age_school_enroll
data <- syn_ind_urb_empID
data$if_school <- 0
for (i in 1:nrow(css_age_school_enroll)) {
a <- css_age_school_enroll[i,2:5]
age_min <- as.numeric(a$age_min)
age_max <- as.numeric(a$age_max)
m <- data %>% filter(age >= age_min & age <= age_max & hh_role != "in_gq")
n <- round(nrow(m) * a$Enrol_school / a$Total, 0)
df <- m %>% slice_sample(n=n) %>% pull(ind_id)
data[data$ind_id %in% df,]$if_school <- 1 }
# Join schoolers with parcel index
data <- left_join(data, syn_hh_gq_prcl[c("hh_id","GEOID","PRINT_KEY")], by=c("hh_id"="hh_id","GEOID"="GEOID")) %>%
filter(if_school == 1 & hh_role != "in_gq")
data$school_id <- 0
# convert to sf object and get school district ID
data <- left_join(data, c_parcel_r["PRINT_KEY"]) %>% st_as_sf() %>% st_centroid()
data <- st_join(data, c_school_dist["SDLCODE"], join=st_intersects)
# c_pub_k12_cln - age vector
lt <- names(c_pub_k12_cln)[3:19] # age range: 3 - 19 yrs
for (i in 1:nrow(data)) {
a <- data[i, ] # child enrolled in school
b <- c_pub_k12_cln %>% filter(get(lt[a$age - 2])=="Y" & SDL_CODE == a$SDLCODE)
if(nrow(b)>0){
b <- b %>% slice_sample(n=1) %>% pull("SED_CODE")
} else {
b <- c_pub_k12_cln %>% filter(get(lt[a$age - 2])=="Y") %>% st_as_sf()
b <- st_join(a, b["SED_CODE"], join = st_nearest_feature) %>% pull("SED_CODE")
}
data[i,]$school_id <- b
print(i)
}
# # save result
# df <- data
# st_geometry(df) <- NULL
# df <- left_join(syn_ind_urb_empID, df[c("ind_id","if_school","school_id","SDLCODE")])
#
# write.csv(df, "data/01_synthetic_population_dataset/XX_1014_individual_urban_rural_emp_place_orgID_sch.csv", row.names = F)
# syn_ind_urb_empID_sch <- df
# set random seed
set.seed(rm_seed)
# Network data containner
df_network <- data.frame(matrix(ncol = 4)) %>% setnames(c("Source","Target","Type","Relation"))
nn <- df_network
foo <- syn_ind_urb_empID_sch # individual
zz <- df_business_pop[!is.na(df_business_pop$act_size),] # business
threhd <- 10 # the threshold of network size to generate a full-connected network
zzz <- zz[zz$act_size > threhd,]
zz <- zz[(zz$act_size <= threhd & zz$act_size > 1),]
# full connected working network
for (i in 1:nrow(zz)) {
idx <- zz[i,]$business_id
# All individuals in a particular company
data <- foo %>% filter(business_id == idx)
a <- combn(data$ind_id,2)
data <- data.frame(Source = a[1,],
Target = a[2,],
Type = "Undirected",
Relation = "Work")
nn <- rbind(nn, data)
print(i)
}
# Scale free working network with n >10
mm <- df_network
lt <- c("avg_degree","diameter")
zzz[lt] <- NA
for (i in 1:nrow(zzz)) {
idx <- zzz[i,]$business_id
n <- zzz[i,]$act_size
# All individuals in a particular company
data <- foo %>% filter(business_id == idx) %>% pull(ind_id) %>% sample()
# Generate Network
g <- sample_pa(n, m = 6, directed =F, power = 1)
zzz[i, lt] <- c(mean(degree(g)), igraph::diameter(g))
g <- as.data.frame(get.edgelist(g)) %>% setnames(new=c("Source","Target"))
g$Source <- mapvalues(g$Source, from=c(1:n), to=data, warn_missing = F)
g$Target <- mapvalues(g$Target, from=c(1:n), to=data, warn_missing = F)
g$Type = "Undirected"
g$Relation = "Work"
mm <- rbind(mm, g)
print(i)
}
nn <- rbind(nn, mm)
nn <- nn[!is.na(nn$Source),]
# export
# ntwk_work <- nn # average degree = 10:025
# write.csv(ntwk_work, "data/02_network_dataset/XX_step2_1015_working_network.csv", row.names = F)
# set random seed
set.seed(rm_seed)
# Network data containner
df_network <- data.frame(matrix(ncol = 4)) %>% setnames(c("Source","Target","Type","Relation"))
nn <- df_network
foo <- syn_ind_urb_empID_sch # individual
zz <- c_pub_k12_cln # public K-12
tt <- table(foo$school_id) %>% as.data.frame() %>% setnames(new=c("SED_CODE","n_child"))
zz <- left_join(zz, tt)
lt <- c("avg_degree","diameter")
zz[lt] <- NA
for (i in 1:nrow(zz)) {
idx <- zz[i,]$SED_CODE
n <- zz[i,]$n_child
# All child in a particular school
data <- foo %>% filter(school_id == idx) %>% pull(ind_id) %>% sample()
# Generate network based on PA
g <- sample_pa(n, m = 3, directed =F, power = 1)
zz[i, lt] <- c(mean(degree(g)), igraph::diameter(g))
g <- as.data.frame(get.edgelist(g)) %>% setnames(new=c("Source","Target"))
g$Source <- mapvalues(g$Source, from=c(1:n), to=data, warn_missing = F)
g$Target <- mapvalues(g$Target, from=c(1:n), to=data, warn_missing = F)
g$Type = "Undirected"
g$Relation = "School"
nn <- rbind(nn, g)
print(i)
}
c_pub_k12_cln_ntwk <- zz
nn <- nn[!is.na(nn$Source),]
# # export
# ntwk_school <- nn
# write.csv(ntwk_school, "data/02_network_dataset/XX_step3_1015_k12_school_network.csv", row.names = F)
# individual living in group quarter
foo <- syn_ind_urb_empID_sch %>% filter(hh_role=="in_gq") %>%
mutate(hh_id2 = paste(hh_id, GEOID, sep="_"))
tt <- foo["hh_id2"] %>% mutate(act_size = 1)
tt <- aggregate(act_size~hh_id2, data = tt, FUN =sum)
threhd <- 5 # the threshold of network size to generate a full-connected network
zzz <- tt[tt$act_size > threhd,]
zz <- tt[(tt$act_size <= threhd & tt$act_size > 1),]
# full connected gq network
nn <- df_network
for (i in 1:nrow(zz)) {
idx <- zz[i,]$hh_id2
# All individuals in a particular gq
data <- foo %>% filter(hh_id2 == idx)
a <- combn(data$ind_id,2)
data <- data.frame(Source = a[1,],
Target = a[2,],
Type = "Undirected",
Relation = "gq")
nn <- rbind(nn, data)
print(i)
}
# scale-free gq network
mm <- df_network
lt <- c("avg_degree","diameter")
zzz[lt] <- NA
for (i in 1:nrow(zzz)) {
idx <- zzz[i,]$hh_id2
n <- zzz[i,]$act_size
# All individuals in a particular gq
data <- foo %>% filter(hh_id2 == idx) %>% pull(ind_id) %>% sample()
# Generate Network
g <- sample_pa(n, m = 3, directed =F, power = 1)
zzz[i, lt] <- c(mean(degree(g)), igraph::diameter(g))
g <- as.data.frame(get.edgelist(g)) %>% setnames(new=c("Source","Target"))
g$Source <- mapvalues(g$Source, from=c(1:n), to=data, warn_missing = F)
g$Target <- mapvalues(g$Target, from=c(1:n), to=data, warn_missing = F)
g$Type = "Undirected"
g$Relation = "gq"
mm <- rbind(mm, g)
print(i)
}
nn <- rbind(nn, mm)
nn <- nn[!is.na(nn$Source),]
# # # export
# ntwk_gq <- nn
# write.csv(ntwk_gq, "data/02_network_dataset/XX_step4_1015_gq_network.csv", row.names = F)
# join individual with parcels
df <- left_join(syn_ind_urb_empID_sch_smedia_teen,
syn_hh_gq_prcl[c("GEOID","hh_id","prcl_idx","PRINT_KEY")], by=c("GEOID"="GEOID","hh_id"="hh_id"))
df <- left_join(df, c_parcel["PRINT_KEY"]) %>% st_as_sf()
# # get centroid of parcel and transform to wgs84 (4326)
# zz <- df %>% st_centroid() %>% st_as_sf() %>% st_transform(crs = 4326)
# keep project
zz <- df %>% st_centroid() %>% st_as_sf()
zzz <- zz %>% mutate(long = unlist(map(zz$geometry,1)), lat = unlist(map(zz$geometry,2)))
st_geometry(zzz) <- NULL
# Add randomness to centroid
# aa <- 0.001 # wgs 84
aa <- 0.05 # NAD 83
zzz$long <- zzz$long + runif(nrow(zzz), -aa, aa)
zzz$lat <- zzz$lat+ runif(nrow(zzz), -aa, aa)
rownames(zzz) <- NULL
zzz["ind_new_id"] <- 1:nrow(zzz) - 1
# Python export
write.csv(zzz, "data/01_synthetic_population_dataset/XX_999_model_individual_urban_rural_emp_place_orgID_sch_smedia_teen_NAD83.csv", row.names = F)
# tt <- st_as_sf(zzz, coords = c("long","lat"), crs=4326)
#
# tmap_mode("plot")
# tm_shape(tt) + tm_dots()
# # Export
# write.csv(zzz, "data/01_synthetic_population_dataset/XX_1311_individual_urban_rural_emp_place_orgID_sch_smedia_teen_coord.csv", row.names = F)
# syn_ind_urb_empID_sch_smedia_teen_coord <- zzz
ntwk_family <- read.csv("data/02_network_dataset/step1_0831_family_network.csv")[,2:5]
ntwk_all <- rbind(ntwk_family,
ntwk_work,
ntwk_school,
ntwk_gq,
ntwk_fb,
ntwk_smedia_teen)
# export network - save
# write.csv(ntwk_all, "data/02_network_dataset/xx_step_finl_all_network.csv", row.names = F)
data <- ntwk_all
data <- left_join(data, zzz[c("ind_id","ind_new_id")], by=c("Source"="ind_id"))
names(data)[[5]] <- "source_reindex"
data <- left_join(data, zzz[c("ind_id","ind_new_id")], by=c("Target"="ind_id"))
names(data)[[6]] <- "target_reindex"
write.csv(data, "data/02_network_dataset/step999_model_step_finl_all_network.csv", row.names = F)
# 1. physical space: nodes & edges
length(unique(c(ntwk_family$Source, ntwk_family$Target, ntwk_gq$Source, ntwk_gq$Target)))
nrow(ntwk_family) + nrow(ntwk_gq)
# 2.1 relational space - school: nodes & edges
length(unique(c(ntwk_school$Source, ntwk_school$Target)))
nrow(ntwk_school)
# 2.2 relational space - work: nodes & edges
length(unique(c(ntwk_work$Source, ntwk_work$Target)))
nrow(ntwk_work)
# 3. cyber space - social media
length(unique(c(ntwk_fb$Source, ntwk_fb$Target, ntwk_smedia_teen$Source, ntwk_smedia_teen$Target)))
nrow(ntwk_fb) + nrow(ntwk_smedia_teen)
tt <- syn_ind_urb_empID_sch_smedia_teen_coord
tt[tt$hh_role=="in_gq",]$hh_id <- paste(tt[tt$hh_role=="in_gq",]$hh_id,
tt[tt$hh_role=="in_gq",]$GEOID,
sep="_")
colnames(tt)[1] <- "Id"
nodes <- tt[tt$Id %in% c(ntwk_all$Source, ntwk_all$Target),]
# # export edges - gephi
# write.csv(ntwk_all, "data/02_network_dataset/network/2023xxxx_network_edge.csv", row.names = F)
# # export nodes - gephi
# write.csv(nodes, "data/02_network_dataset/network/2023xxxx_network_nodes.csv", row.names = F)
## Plot
pdf("plot/geo_school_district.pdf")
tm_shape(c_county) + tm_polygons(alpha = 0, border.col = "red") + tm_shape(c_school_dist) + tm_polygons(alpha = 0)
dev.off()
Social Media Usage - Facebook
ADULT USER [18,∞)
Adult: Social Media Network
average degree in county: 64
TEENS USER[13, 17]
Teen: Social Networks