Suggested citation: Madaga, Lavinia, Jordan Chamberlin, Bisrat Gebrekidan, Robert Hijmans, Nicholaus Kuboja, and Maxwell Mkondiwa. 2024. “Predictive mapping of wholesale grain prices for rural areas in Tanzania.” https://github.com/EiA2030-ex-ante/Tanzania-Price-Data
We are interested in estimating the prices of agricultural food commodities across space and time, on the basis of prices as observed at some market locations. Here we explore methods to model these prices, using monthly data from across Tanzania over the period May 2021-July 2024.
This document describes the process of fitting a Random Forest model to predict these prices. The performance of the Random Forest model is evaluated using Root Mean Square Errors (RMSE) and R-Square as test statistics. We also compare the observed prices with the predicted prices using an out of sample dataset.
library(geodata)
## Warning: package 'geodata' was built under R version 4.3.3
library(lubridate)
library(terra)
library(data.table)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
library(httr)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'lattice' was built under R version 4.3.3
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
library(pdp)
## Warning: package 'pdp' was built under R version 4.3.3
library(gridExtra)
library(stats)
library(dplyr)
library(stringr)
library(fields)
## Warning: package 'fields' was built under R version 4.3.3
## Warning: package 'spam' was built under R version 4.3.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
library(ggplot2)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(gdistance)
## Warning: package 'gdistance' was built under R version 4.3.3
## Warning: package 'igraph' was built under R version 4.3.3
library(sp)
library(raster)
library(foreign)
library(gdistance)
library(foreach)
## Warning: package 'foreach' was built under R version 4.3.3
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.3.3
## Warning: package 'iterators' was built under R version 4.3.3
library(tidyverse)
library(segmented)
## Warning: package 'segmented' was built under R version 4.3.3
library(modelbased)
## Warning: package 'modelbased' was built under R version 4.3.3
library(parameters)
## Warning: package 'parameters' was built under R version 4.3.3
library(RColorBrewer)
library(effsize)
## Warning: package 'effsize' was built under R version 4.3.3
library(gghalves)
## Warning: package 'gghalves' was built under R version 4.3.3
library(viridis)
This dataset contains price information for various crops across different regions and markets in Tanzania from 2021 to 2024. The data was acquired from Tanzania’s Ministry of Industry and Trade.
setwd("H:/Tanzania Price data/Datasets")
prices <- fread("Tanzania_Price_Data_AllCrops_with_Coordinates4.csv")
dim(prices)
## [1] 9450 21
head(prices)
## Region Market Maize..min.price. Maize..max.price. Rice..min.price.
## 1: Arusha Arusha 43000 45000 140000
## 2: Dar es Salaam Temeke 46000 47000 120000
## 3: Dodoma Majengo 45000 50000 132000
## 4: Dodoma Kibaigwa 30000 42000 NA
## 5: Kagera Bukoba 33000 35000 110000
## 6: Manyara Babati 30000 45000 120000
## Rice..max.price. Sorghum..min.price. Sorghum..max.price.
## 1: 170000 60000 65000
## 2: 210000 80000 100000
## 3: 200000 50000 60000
## 4: NA 45000 48000
## 5: 140000 140000 150000
## 6: 170000 60000 80000
## Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## 1: 70000 75000
## 2: 80000 100000
## 3: 52000 64000
## 4: NA NA
## 5: 120000 140000
## 6: 80000 100000
## Finger.Millet..min.price. Finger.Millet..max.price. Wheat..min.price.
## 1: 120000 125000 70000
## 2: 175000 180000 110000
## 3: 200000 252000 NA
## 4: NA NA NA
## 5: 170000 180000 170000
## 6: 120000 130000 100000
## Wheat..max.price. Beans..min.price. Beans..max.price.
## 1: 78000 130000 165000
## 2: 120000 220000 260000
## 3: NA 200000 245000
## 4: NA NA NA
## 5: 180000 110000 170000
## 6: 120000 150000 180000
## Irish.Potatoes..min.price. Irish.Potatoes..max.price. Date Latitude
## 1: 70000 75000 5/5/2021 -3.36968
## 2: 75000 75000 5/5/2021 -6.85097
## 3: 56000 68000 5/5/2021 -6.17971
## 4: NA NA 5/5/2021 -6.08110
## 5: 60000 75000 5/5/2021 -1.33140
## 6: 60000 60000 5/5/2021 -4.21006
## Longitude
## 1: 36.68808
## 2: 39.25672
## 3: 35.74109
## 4: 36.64645
## 5: 31.81293
## 6: 35.74915
table(prices$Market)
##
## Arusha Babati Bariadi Buguruni Bukoba Igawilo
## 374 294 129 10 440 89
## Ilala Iringa Kibaigwa Kigoma Kilombero Kinondoni
## 254 443 211 298 89 203
## Lindi majengo Majengo Makambako Manyara Mbeya
## 297 61 479 9 116 117
## Mgandini Mnalani Morogoro Moshi Mpanda Mpimbwe
## 89 40 437 303 195 39
## Mtwara Musoma Mwananyamala Mwanza Namfua Njombe
## 451 338 89 400 89 174
## Nyankumbu Pwani Shinyanga Singida Songea Songwe
## 127 89 333 83 412 99
## Sumbawanga Tabora Tandale Tandika Tanga Temeke
## 436 415 106 115 361 220
## Ubungo
## 97
sapply(prices, class)
## Region Market
## "character" "character"
## Maize..min.price. Maize..max.price.
## "integer" "integer"
## Rice..min.price. Rice..max.price.
## "integer" "integer"
## Sorghum..min.price. Sorghum..max.price.
## "integer" "integer"
## Bulrush.Millet..min.price. Bulrush.Millet..max.price.
## "integer" "integer"
## Finger.Millet..min.price. Finger.Millet..max.price.
## "integer" "integer"
## Wheat..min.price. Wheat..max.price.
## "integer" "integer"
## Beans..min.price. Beans..max.price.
## "integer" "integer"
## Irish.Potatoes..min.price. Irish.Potatoes..max.price.
## "integer" "integer"
## Date Latitude
## "character" "numeric"
## Longitude
## "numeric"
# Convert to date format
prices$Date <- lubridate::mdy(prices$Date)
Check the Region and Market names and coodinates. Make sure the Region and Market names are harmonized and properly geocoded
unique(prices[Region=="Arusha",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Arusha -3.36968 36.68808
## 2: Kilombero -3.37324 36.67870
unique(prices[Region=="Dar es Salaam",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Temeke -6.850970 39.25672
## 2: Kinondoni -6.784070 39.27007
## 3: Ilala -6.829410 39.26289
## 4: Tandika -6.867912 39.25480
## 5: Buguruni -6.838380 39.24359
## 6: Tandale -6.795230 39.24085
## 7: Ubungo -6.793620 39.20966
## 8: Mwananyamala -6.788890 39.25840
unique(prices[Region=="Dodoma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Majengo -6.17971 35.74109
## 2: Kibaigwa -6.08110 36.64645
unique(prices[Region=="Kagera",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Bukoba -1.3314 31.81293
unique(prices[Region=="Manyara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Babati -4.21006 35.74915
## 2: Manyara -4.46011 37.20217
unique(prices[Region=="Rukwa",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Sumbawanga -7.95239 31.62319
unique(prices[Region=="Mpanda",.(Market, Latitude, Longitude)])
## Empty data.table (0 rows and 3 cols): Market,Latitude,Longitude
unique(prices[Region=="Mtwara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mtwara -10.28009 40.18191
unique(prices[Region=="Tabora",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Tabora -5.01972 32.80767
unique(prices[Region=="Tanga",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Tanga -5.074260 39.09993
## 2: Mgandini -5.086235 39.09561
unique(prices[Region=="Iringa",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Iringa -7.78001 35.69671
unique(prices[Region=="Kigoma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Kigoma -4.89697 29.65987
unique(prices[Region=="Morogoro",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Morogoro -6.82771 37.66542
unique(prices[Region=="Mwanza",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mwanza -2.51969 32.90144
unique(prices[Region=="Mara",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Musoma -1.49982 33.8083
unique(prices[Region=="Ruvuma",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Songea -10.67873 35.64836
unique(prices[Region=="Shinyanga",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Shinyanga -3.67226 33.43069
unique(prices[Region=="Kilimanjaro",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Moshi -3.34865 37.34352
unique(prices[Region=="Mbeya",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mbeya -8.909940 33.45517
## 2: Igawilo -8.924246 33.56788
unique(prices[Region=="Katavi",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Mpanda -6.34295 31.07299
## 2: Mpimbwe -7.24425 31.81782
## 3: majengo -6.34809 31.07055
## 4: Majengo -6.34809 31.07055
unique(prices[Region=="Njombe",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Njombe -9.33805 34.76949
## 2: Makambako -8.84398 34.82299
unique(prices[Region=="Lindi",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Lindi -9.995 39.708
unique(prices[Region=="Singida",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Singida -4.812610 34.7428
## 2: Namfua -4.821121 34.7470
unique(prices[Region=="Pwani",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Pwani -6.44221 38.90622
## 2: Mnalani -6.44221 38.90622
unique(prices[Region=="Simiyu",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Bariadi -2.80355 33.98699
unique(prices[Region=="Geita",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Nyankumbu -2.870760 32.23408
## 2: Nyankumbu -2.895955 32.22871
unique(prices[Region=="Songwe",.(Market, Latitude, Longitude)])
## Market Latitude Longitude
## 1: Songwe -8.95179 33.24377
setnames(prices, old = "Maize..min.price.", new = "mai.price.min")
setnames(prices, old = "Rice..min.price.", new = "ric.price.min")
setnames(prices, old = "Sorghum..min.price.", new = "sor.price.min")
setnames(prices, old = "Bulrush.Millet..min.price.", new = "bul.price.min")
setnames(prices, old = "Finger.Millet..min.price.", new = "fin.price.min")
setnames(prices, old = "Wheat..min.price.", new = "whe.price.min")
setnames(prices, old = "Beans..min.price.", new = "bea.price.min")
setnames(prices, old = "Irish.Potatoes..min.price.", new = "pot.price.min")
setnames(prices, old = "Maize..max.price.", new = "mai.price.max")
setnames(prices, old = "Rice..max.price.", new = "ric.price.max")
setnames(prices, old = "Sorghum..max.price.", new = "sor.price.max")
setnames(prices, old = "Bulrush.Millet..max.price.", new = "bul.price.max")
setnames(prices, old = "Finger.Millet..max.price.", new = "fin.price.max")
setnames(prices, old = "Wheat..max.price.", new = "whe.price.max")
setnames(prices, old = "Beans..max.price.", new = "bea.price.max")
setnames(prices, old = "Irish.Potatoes..max.price.", new = "pot.price.max")
sapply(prices, class)
## Region Market mai.price.min mai.price.max ric.price.min
## "character" "character" "integer" "integer" "integer"
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max
## "integer" "integer" "integer" "integer" "integer"
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min
## "integer" "integer" "integer" "integer" "integer"
## bea.price.max pot.price.min pot.price.max Date Latitude
## "integer" "integer" "integer" "Date" "numeric"
## Longitude
## "numeric"
#convert prices to numeric
prices$mai.price.min <- as.numeric(prices$mai.price.min)
prices$ric.price.min <- as.numeric(prices$ric.price.min)
prices$sor.price.min <- as.numeric(prices$sor.price.min)
prices$bul.price.min <- as.numeric(prices$bul.price.min)
prices$fin.price.min <- as.numeric(prices$fin.price.min)
prices$whe.price.min <- as.numeric(prices$whe.price.min)
prices$bea.price.min <- as.numeric(prices$bea.price.min)
prices$pot.price.min <- as.numeric(prices$pot.price.min)
prices$mai.price.max <- as.numeric(prices$mai.price.max)
prices$ric.price.max <- as.numeric(prices$ric.price.max)
prices$sor.price.max <- as.numeric(prices$sor.price.max)
prices$bul.price.max <- as.numeric(prices$bul.price.max)
prices$fin.price.max <- as.numeric(prices$fin.price.max)
prices$whe.price.max <- as.numeric(prices$whe.price.max)
prices$bea.price.max <- as.numeric(prices$bea.price.max)
prices$pot.price.max <- as.numeric(prices$pot.price.max)
sapply(prices, class)
## Region Market mai.price.min mai.price.max ric.price.min
## "character" "character" "numeric" "numeric" "numeric"
## ric.price.max sor.price.min sor.price.max bul.price.min bul.price.max
## "numeric" "numeric" "numeric" "numeric" "numeric"
## fin.price.min fin.price.max whe.price.min whe.price.max bea.price.min
## "numeric" "numeric" "numeric" "numeric" "numeric"
## bea.price.max pot.price.min pot.price.max Date Latitude
## "numeric" "numeric" "numeric" "Date" "numeric"
## Longitude
## "numeric"
# convert to price per kg
prices$mai.price.min <- prices$mai.price.min/100
prices$ric.price.min <- prices$ric.price.min/100
prices$sor.price.min <- prices$sor.price.min/100
prices$bul.price.min <- prices$bul.price.min/100
prices$fin.price.min <- prices$fin.price.min/100
prices$whe.price.min <- prices$whe.price.min/100
prices$bea.price.min <- prices$bea.price.min/100
prices$pot.price.min <- prices$pot.price.min/100
prices$mai.price.max <- prices$mai.price.max/100
prices$ric.price.max <- prices$ric.price.max/100
prices$sor.price.max <- prices$sor.price.max/100
prices$bul.price.max <- prices$bul.price.max/100
prices$fin.price.max <- prices$fin.price.max/100
prices$whe.price.max <- prices$whe.price.max/100
prices$bea.price.max <- prices$bea.price.max/100
prices$pot.price.max <- prices$pot.price.max/100
# calculate average of min and max
prices$mai.price <- (prices$mai.price.min + prices$mai.price.max) / 2
prices$ric.price <- (prices$ric.price.min + prices$ric.price.max) / 2
prices$sor.price <- (prices$sor.price.min + prices$sor.price.max) / 2
prices$bul.price <- (prices$bul.price.min + prices$bul.price.max) / 2
prices$fin.price <- (prices$fin.price.min + prices$fin.price.max) / 2
prices$whe.price <- (prices$whe.price.min + prices$whe.price.max) / 2
prices$bea.price <- (prices$bea.price.min + prices$bea.price.max) / 2
prices$pot.price <- (prices$pot.price.min + prices$pot.price.max) / 2
#We can add dates by using the year and the month names
prices$Day <- day(prices$Date)
prices$Month <- month(prices$Date)
prices$Year <- year(prices$Date)
# drop unneccessary columns
prices <- prices[,!c("mai.price.min", "mai.price.max",
"ric.price.min", "ric.price.max",
"sor.price.min", "sor.price.max",
"bul.price.min", "bul.price.max",
"fin.price.min", "fin.price.max",
"whe.price.min", "whe.price.max",
"bea.price.min", "bea.price.max",
"pot.price.min", "pot.price.max")]
# calculate monthly mean prices by market
prices.monthly <- prices[, .(mai.price = mean(mai.price, na.rm = TRUE),
ric.price = mean(ric.price, na.rm = TRUE),
sor.price = mean(sor.price, na.rm = TRUE),
bul.price = mean(bul.price, na.rm = TRUE),
fin.price = mean(fin.price, na.rm = TRUE),
whe.price = mean(whe.price, na.rm = TRUE),
bea.price = mean(bea.price, na.rm = TRUE),
pot.price = mean(pot.price, na.rm = TRUE)),
by=.(Region, Market, Month, Year, Latitude, Longitude)]
# reshape to long (so that prices for different commodities can be simultaneously estimated)
prices.monthly
## Region Market Month Year Latitude Longitude mai.price
## 1: Arusha Arusha 5 2021 -3.36968 36.68808 469.00
## 2: Dar es Salaam Temeke 5 2021 -6.85097 39.25672 485.00
## 3: Dodoma Majengo 5 2021 -6.17971 35.74109 501.50
## 4: Dodoma Kibaigwa 5 2021 -6.08110 36.64645 375.00
## 5: Kagera Bukoba 5 2021 -1.33140 31.81293 426.25
## ---
## 1012: Pwani Mnalani 12 2024 -6.44221 38.90622 1300.00
## 1013: Simiyu Bariadi 12 2024 -2.80355 33.98699 720.00
## 1014: Kigoma Kigoma 12 2024 -4.89697 29.65987 722.00
## 1015: Njombe Makambako 12 2024 -8.84398 34.82299 569.50
## 1016: Rukwa Sumbawanga 12 2024 -7.95239 31.62319 676.25
## ric.price sor.price bul.price fin.price whe.price bea.price pot.price
## 1: 1530 695.0 761.0 1333.0 793.00 1545.0 745
## 2: 1650 900.0 900.0 1775.0 1230.00 2420.0 730
## 3: 1592 558.0 581.0 1752.0 NaN 2075.0 618
## 4: NaN 437.5 NaN NaN NaN NaN NaN
## 5: 1260 1375.0 1337.5 1637.5 1637.50 1300.0 675
## ---
## 1012: 1950 1700.0 1600.0 2300.0 2200.00 3250.0 1500
## 1013: 1500 1650.0 1900.0 2300.0 2500.00 2950.0 1450
## 1014: 1600 900.0 1600.0 2300.0 1750.00 2700.0 1000
## 1015: 2375 850.0 NaN 1625.0 1300.00 2550.0 590
## 1016: 1950 NaN NaN 975.0 843.75 2162.5 725
prices.monthly.long <- melt(prices.monthly, id.vars=c('Region', 'Market', 'Month', 'Year', 'Latitude', 'Longitude'),)
# rename columns
setnames(prices.monthly.long, old="variable", new="Crop")
setnames(prices.monthly.long, old="value", new="pkg")
# replace crop names
prices.monthly.long[Crop == "mai.price", Crop := "Maize"]
prices.monthly.long[Crop == "ric.price", Crop := "Rice"]
prices.monthly.long[Crop == "sor.price", Crop := "Sorghum"]
prices.monthly.long[Crop == "bul.price", Crop := "B.Millet"]
prices.monthly.long[Crop == "fin.price", Crop := "F.Millet"]
prices.monthly.long[Crop == "whe.price", Crop := "Wheat"]
prices.monthly.long[Crop == "bea.price", Crop := "Beans"]
prices.monthly.long[Crop == "pot.price", Crop := "Potato"]
# Reset the factor levels to updated levels
prices.monthly.long[, Crop := factor(Crop)]
# Check the unique values again
unique(prices.monthly.long$Crop)
## [1] Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
## Levels: Maize Rice Sorghum B.Millet F.Millet Wheat Beans Potato
# generate dummies to use in place of factors (for later spatial predictions, which are struggling with factors)
prices.monthly.long[, maize := ifelse(Crop == "Maize",1,0)]
prices.monthly.long[, rice := ifelse(Crop == "Rice",1,0)]
prices.monthly.long[, sorghum := ifelse(Crop == "Sorghum",1,0)]
prices.monthly.long[, bmillet := ifelse(Crop == "B.Millet",1,0)]
prices.monthly.long[, fmillet := ifelse(Crop == "F.Millet",1,0)]
prices.monthly.long[, wheat := ifelse(Crop == "Wheat",1,0)]
prices.monthly.long[, beans := ifelse(Crop == "Beans",1,0)]
prices.monthly.long[, potato := ifelse(Crop == "Potato",1,0)]
prices.monthly.long[, jan := ifelse(Month == 1 , 1, 0)]
prices.monthly.long[, feb := ifelse(Month == 2 , 1, 0)]
prices.monthly.long[, mar := ifelse(Month == 3 , 1, 0)]
prices.monthly.long[, apr := ifelse(Month == 4 , 1, 0)]
prices.monthly.long[, may := ifelse(Month == 5 , 1, 0)]
prices.monthly.long[, jun := ifelse(Month == 6 , 1, 0)]
prices.monthly.long[, jul := ifelse(Month == 7 , 1, 0)]
prices.monthly.long[, aug := ifelse(Month == 8 , 1, 0)]
prices.monthly.long[, sep := ifelse(Month == 9 , 1, 0)]
prices.monthly.long[, oct := ifelse(Month == 10 , 1, 0)]
prices.monthly.long[, nov := ifelse(Month == 11 , 1, 0)]
prices.monthly.long[, dec := ifelse(Month == 12 , 1, 0)]
# replace NaN with NAs in the price observations
prices.monthly.long[is.nan(pkg), pkg := NA]
# Remove observations with missing observations
prices.monthly.long <- na.omit(prices.monthly.long)
# bring in raster stack as predictors
geodata_path("H:/Tanzania Price data/Datasets/geodata")
list.files("H:/Tanzania Price data/Datasets/geodata", recursive=TRUE)
## [1] "climate/wc2.1_country/TZA_wc2.1_30s_bio.tif"
## [2] "travel/travel_time_to_cities_u3.tif"
## [3] "travel/travel_time_to_cities_u5.tif"
## [4] "TRUE/gadm/gadm41_TZA_0_pk.rds"
## [5] "TRUE/gadm/gadm41_TZA_1_pk.rds"
## [6] "TRUE/gadm/gadm41_TZA_2_pk.rds"
## [7] "TRUE/gadm/gadm41_TZA_3_pk.rds"
## [8] "TRUE/spam/spam2017v2r1_ssa_area.zip"
## [9] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif"
## [10] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_H.tif"
## [11] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_I.tif"
## [12] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_L.tif"
## [13] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_R.tif"
## [14] "TRUE/spam/spam2017V2r1_SSA_H_MAIZ_S.tif"
## [15] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_A.tif"
## [16] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_H.tif"
## [17] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_I.tif"
## [18] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_L.tif"
## [19] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif"
## [20] "TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_S.tif"
## [21] "TRUE/spam/spam2017v2r1_ssa_yield.zip"
## [22] "TRUE/travel/travel_time_to_cities_u5.tif"
## [23] "TRUE/travel/travel_time_to_ports_1.tif"
## [24] "TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif"
# tza0 <- gadm(country="TZA", level=0)
# tza1 <- gadm(country="TZA", level=1)
# tza2 <- gadm(country="TZA", level=2)
# tza3 <- gadm(country="TZA", level=3)
tza0 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_0_pk.rds")
tza1 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_1_pk.rds")
tza2 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_2_pk.rds")
tza3 <- readRDS("H:/Tanzania Price data/Datasets/geodata/TRUE/gadm/gadm41_TZA_3_pk.rds")
# convert prices observations to vector for mapping
mypts <- vect(prices.monthly.long, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
# see if these show up correctly
plot(tza1)
plot(mypts, col="Red", add=TRUE)
# text(mypts, label="Market")
# Extract Region information from GADM
region_info <- terra::extract(tza1, mypts)
head(region_info)
## id.y GID_1 GID_0 COUNTRY NAME_1 VARNAME_1 NL_NAME_1
## 1 1 TZA.1_1 TZA Tanzania Arusha <NA> <NA>
## 2 2 TZA.2_1 TZA Tanzania Dar es Salaam Dar-es-salaam|Pwani <NA>
## 3 3 TZA.3_1 TZA Tanzania Dodoma <NA> <NA>
## 4 4 TZA.3_1 TZA Tanzania Dodoma <NA> <NA>
## 5 5 TZA.6_1 TZA Tanzania Kagera <NA> <NA>
## 6 6 TZA.11_1 TZA Tanzania Manyara <NA> <NA>
## TYPE_1 ENGTYPE_1 CC_1 HASC_1 ISO_1
## 1 Mkoa Region 02 TZ.AS TZ-01
## 2 Mkoa Region 07 TZ.DS TZ-02
## 3 Mkoa Region 01 TZ.DO TZ-03
## 4 Mkoa Region 01 TZ.DO TZ-03
## 5 Mkoa Region 18 TZ.KG TZ-05
## 6 Mkoa Region 21 TZ.MY TZ-26
# merge district names to the dataset
prices.monthly.long$Region_GADM <- region_info$NAME_1
#prices.monthly.long$District_GADM <- region_info$NAME_2
head(prices.monthly.long)
## Region Market Month Year Latitude Longitude Crop pkg maize rice
## 1: Arusha Arusha 5 2021 -3.36968 36.68808 Maize 469.00 1 0
## 2: Dar es Salaam Temeke 5 2021 -6.85097 39.25672 Maize 485.00 1 0
## 3: Dodoma Majengo 5 2021 -6.17971 35.74109 Maize 501.50 1 0
## 4: Dodoma Kibaigwa 5 2021 -6.08110 36.64645 Maize 375.00 1 0
## 5: Kagera Bukoba 5 2021 -1.33140 31.81293 Maize 426.25 1 0
## 6: Manyara Babati 5 2021 -4.21006 35.74915 Maize 375.00 1 0
## sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1: 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 2: 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 3: 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 4: 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 5: 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 6: 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## sep oct nov dec Region_GADM
## 1: 0 0 0 0 Arusha
## 2: 0 0 0 0 Dar es Salaam
## 3: 0 0 0 0 Dodoma
## 4: 0 0 0 0 Dodoma
## 5: 0 0 0 0 Kagera
## 6: 0 0 0 0 Manyara
unique(prices.monthly.long[, c("Region", "Market", "Region_GADM")])
## Region Market Region_GADM
## 1: Arusha Arusha Arusha
## 2: Dar es Salaam Temeke Dar es Salaam
## 3: Dodoma Majengo Dodoma
## 4: Dodoma Kibaigwa Dodoma
## 5: Kagera Bukoba Kagera
## 6: Manyara Babati Manyara
## 7: Rukwa Sumbawanga Rukwa
## 8: Katavi Mpanda Katavi
## 9: Mtwara Mtwara Mtwara
## 10: Tabora Tabora Tabora
## 11: Tanga Tanga Tanga
## 12: Dar es Salaam Kinondoni Dar es Salaam
## 13: Dar es Salaam Ilala Dar es Salaam
## 14: Iringa Iringa Iringa
## 15: Kigoma Kigoma Kigoma
## 16: Morogoro Morogoro Morogoro
## 17: Mwanza Mwanza Mwanza
## 18: Mara Musoma Mara
## 19: Ruvuma Songea Ruvuma
## 20: Shinyanga Shinyanga Shinyanga
## 21: Kilimanjaro Moshi Kilimanjaro
## 22: Mbeya Mbeya Mbeya
## 23: Njombe Njombe Njombe
## 24: Lindi Lindi Lindi
## 25: Manyara Manyara Manyara
## 26: Dar es Salaam Tandika Dar es Salaam
## 27: Dar es Salaam Buguruni Dar es Salaam
## 28: Dar es Salaam Tandale Dar es Salaam
## 29: Singida Singida Singida
## 30: Pwani Pwani Pwani
## 31: Simiyu Bariadi Simiyu
## 32: Katavi Mpimbwe Katavi
## 33: Geita Nyankumbu Geita
## 34: Songwe Songwe Mbeya
## 35: Dar es Salaam Mwananyamala Dar es Salaam
## 36: Singida Namfua Singida
## 37: Arusha Kilombero Arusha
## 38: Tanga Mgandini Tanga
## 39: Katavi majengo Katavi
## 40: Mbeya Igawilo Mbeya
## 41: Pwani Mnalani Pwani
## 42: Katavi Majengo Katavi
## 43: Njombe Makambako Njombe
## 44: Dar es Salaam Ubungo Dar es Salaam
## Region Market Region_GADM
# Check if Region matches Region_GADM
prices.monthly.long$region_match <- prices.monthly.long$Region == prices.monthly.long$Region_GADM
table(prices.monthly.long$region_match)
##
## FALSE TRUE
## 73 6985
# Show mismatched rows
mis <- subset(prices.monthly.long, region_match == FALSE)
mis
## Region Market Month Year Latitude Longitude Crop pkg maize rice
## 1: Songwe Songwe 3 2024 -8.95179 33.24377 Maize 366.5000 1 0
## 2: Songwe Songwe 4 2024 -8.95179 33.24377 Maize 435.3889 1 0
## 3: Songwe Songwe 5 2024 -8.95179 33.24377 Maize 438.4615 1 0
## 4: Songwe Songwe 6 2024 -8.95179 33.24377 Maize 433.0000 1 0
## 5: Songwe Songwe 7 2024 -8.95179 33.24377 Maize 433.0000 1 0
## 6: Songwe Songwe 8 2024 -8.95179 33.24377 Maize 433.0000 1 0
## 7: Songwe Songwe 9 2024 -8.95179 33.24377 Maize 440.6667 1 0
## 8: Songwe Songwe 10 2024 -8.95179 33.24377 Maize 526.8333 1 0
## 9: Songwe Songwe 11 2024 -8.95179 33.24377 Maize 536.2500 1 0
## 10: Songwe Songwe 12 2024 -8.95179 33.24377 Maize 552.5000 1 0
## 11: Songwe Songwe 3 2024 -8.95179 33.24377 Rice 2400.0000 0 1
## 12: Songwe Songwe 4 2024 -8.95179 33.24377 Rice 2133.3333 0 1
## 13: Songwe Songwe 5 2024 -8.95179 33.24377 Rice 1921.1538 0 1
## 14: Songwe Songwe 6 2024 -8.95179 33.24377 Rice 1700.0000 0 1
## 15: Songwe Songwe 7 2024 -8.95179 33.24377 Rice 1700.0000 0 1
## 16: Songwe Songwe 8 2024 -8.95179 33.24377 Rice 1700.0000 0 1
## 17: Songwe Songwe 9 2024 -8.95179 33.24377 Rice 1702.0833 0 1
## 18: Songwe Songwe 10 2024 -8.95179 33.24377 Rice 1837.5000 0 1
## 19: Songwe Songwe 11 2024 -8.95179 33.24377 Rice 1904.1667 0 1
## 20: Songwe Songwe 12 2024 -8.95179 33.24377 Rice 1950.0000 0 1
## 21: Songwe Songwe 3 2024 -8.95179 33.24377 Sorghum 1500.0000 0 0
## 22: Songwe Songwe 4 2024 -8.95179 33.24377 Sorghum 1500.0000 0 0
## 23: Songwe Songwe 5 2024 -8.95179 33.24377 Sorghum 1500.0000 0 0
## 24: Songwe Songwe 6 2024 -8.95179 33.24377 Sorghum 1500.0000 0 0
## 25: Songwe Songwe 7 2024 -8.95179 33.24377 Sorghum 1500.0000 0 0
## 26: Songwe Songwe 8 2024 -8.95179 33.24377 Sorghum 1500.0000 0 0
## 27: Songwe Songwe 9 2024 -8.95179 33.24377 Sorghum 1500.0000 0 0
## 28: Songwe Songwe 10 2024 -8.95179 33.24377 Sorghum 1750.0000 0 0
## 29: Songwe Songwe 11 2024 -8.95179 33.24377 Sorghum 2000.0000 0 0
## 30: Songwe Songwe 12 2024 -8.95179 33.24377 Sorghum 2000.0000 0 0
## 31: Songwe Songwe 10 2024 -8.95179 33.24377 B.Millet 2000.0000 0 0
## 32: Songwe Songwe 11 2024 -8.95179 33.24377 B.Millet 2000.0000 0 0
## 33: Songwe Songwe 12 2024 -8.95179 33.24377 B.Millet 2000.0000 0 0
## 34: Songwe Songwe 3 2024 -8.95179 33.24377 F.Millet 1500.0000 0 0
## 35: Songwe Songwe 4 2024 -8.95179 33.24377 F.Millet 1500.0000 0 0
## 36: Songwe Songwe 5 2024 -8.95179 33.24377 F.Millet 1538.4615 0 0
## 37: Songwe Songwe 6 2024 -8.95179 33.24377 F.Millet 2000.0000 0 0
## 38: Songwe Songwe 7 2024 -8.95179 33.24377 F.Millet 2000.0000 0 0
## 39: Songwe Songwe 8 2024 -8.95179 33.24377 F.Millet 2000.0000 0 0
## 40: Songwe Songwe 9 2024 -8.95179 33.24377 F.Millet 2000.0000 0 0
## 41: Songwe Songwe 10 2024 -8.95179 33.24377 F.Millet 1583.3333 0 0
## 42: Songwe Songwe 11 2024 -8.95179 33.24377 F.Millet 1791.6667 0 0
## 43: Songwe Songwe 12 2024 -8.95179 33.24377 F.Millet 2000.0000 0 0
## 44: Songwe Songwe 3 2024 -8.95179 33.24377 Wheat 2500.0000 0 0
## 45: Songwe Songwe 4 2024 -8.95179 33.24377 Wheat 2055.5556 0 0
## 46: Songwe Songwe 5 2024 -8.95179 33.24377 Wheat 2000.0000 0 0
## 47: Songwe Songwe 6 2024 -8.95179 33.24377 Wheat 2000.0000 0 0
## 48: Songwe Songwe 7 2024 -8.95179 33.24377 Wheat 2000.0000 0 0
## 49: Songwe Songwe 8 2024 -8.95179 33.24377 Wheat 2000.0000 0 0
## 50: Songwe Songwe 9 2024 -8.95179 33.24377 Wheat 2000.0000 0 0
## 51: Songwe Songwe 10 2024 -8.95179 33.24377 Wheat 2000.0000 0 0
## 52: Songwe Songwe 11 2024 -8.95179 33.24377 Wheat 2000.0000 0 0
## 53: Songwe Songwe 12 2024 -8.95179 33.24377 Wheat 2000.0000 0 0
## 54: Songwe Songwe 3 2024 -8.95179 33.24377 Beans 1500.0000 0 0
## 55: Songwe Songwe 4 2024 -8.95179 33.24377 Beans 1500.0000 0 0
## 56: Songwe Songwe 5 2024 -8.95179 33.24377 Beans 1823.0769 0 0
## 57: Songwe Songwe 6 2024 -8.95179 33.24377 Beans 2000.0000 0 0
## 58: Songwe Songwe 7 2024 -8.95179 33.24377 Beans 2000.0000 0 0
## 59: Songwe Songwe 8 2024 -8.95179 33.24377 Beans 2000.0000 0 0
## 60: Songwe Songwe 9 2024 -8.95179 33.24377 Beans 2000.0000 0 0
## 61: Songwe Songwe 10 2024 -8.95179 33.24377 Beans 2150.0000 0 0
## 62: Songwe Songwe 11 2024 -8.95179 33.24377 Beans 2433.3333 0 0
## 63: Songwe Songwe 12 2024 -8.95179 33.24377 Beans 2850.0000 0 0
## 64: Songwe Songwe 3 2024 -8.95179 33.24377 Potato 1200.0000 0 0
## 65: Songwe Songwe 4 2024 -8.95179 33.24377 Potato 1466.6667 0 0
## 66: Songwe Songwe 5 2024 -8.95179 33.24377 Potato 1392.3077 0 0
## 67: Songwe Songwe 6 2024 -8.95179 33.24377 Potato 1500.0000 0 0
## 68: Songwe Songwe 7 2024 -8.95179 33.24377 Potato 1500.0000 0 0
## 69: Songwe Songwe 8 2024 -8.95179 33.24377 Potato 1500.0000 0 0
## 70: Songwe Songwe 9 2024 -8.95179 33.24377 Potato 1500.0000 0 0
## 71: Songwe Songwe 10 2024 -8.95179 33.24377 Potato 1500.0000 0 0
## 72: Songwe Songwe 11 2024 -8.95179 33.24377 Potato 1616.6667 0 0
## 73: Songwe Songwe 12 2024 -8.95179 33.24377 Potato 1900.0000 0 0
## Region Market Month Year Latitude Longitude Crop pkg maize rice
## sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## 1: 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 2: 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 3: 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 4: 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 5: 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 6: 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 7: 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 8: 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 9: 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 10: 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 11: 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 12: 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 13: 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 14: 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 15: 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## 16: 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 17: 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 18: 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 19: 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 20: 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 21: 1 0 0 0 0 0 0 0 1 0 0 0 0 0
## 22: 1 0 0 0 0 0 0 0 0 1 0 0 0 0
## 23: 1 0 0 0 0 0 0 0 0 0 1 0 0 0
## 24: 1 0 0 0 0 0 0 0 0 0 0 1 0 0
## 25: 1 0 0 0 0 0 0 0 0 0 0 0 1 0
## 26: 1 0 0 0 0 0 0 0 0 0 0 0 0 1
## 27: 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 28: 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 29: 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 30: 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 31: 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 32: 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 33: 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 34: 0 0 1 0 0 0 0 0 1 0 0 0 0 0
## 35: 0 0 1 0 0 0 0 0 0 1 0 0 0 0
## 36: 0 0 1 0 0 0 0 0 0 0 1 0 0 0
## 37: 0 0 1 0 0 0 0 0 0 0 0 1 0 0
## 38: 0 0 1 0 0 0 0 0 0 0 0 0 1 0
## 39: 0 0 1 0 0 0 0 0 0 0 0 0 0 1
## 40: 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 41: 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 42: 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 43: 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 44: 0 0 0 1 0 0 0 0 1 0 0 0 0 0
## 45: 0 0 0 1 0 0 0 0 0 1 0 0 0 0
## 46: 0 0 0 1 0 0 0 0 0 0 1 0 0 0
## 47: 0 0 0 1 0 0 0 0 0 0 0 1 0 0
## 48: 0 0 0 1 0 0 0 0 0 0 0 0 1 0
## 49: 0 0 0 1 0 0 0 0 0 0 0 0 0 1
## 50: 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 51: 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 52: 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 53: 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 54: 0 0 0 0 1 0 0 0 1 0 0 0 0 0
## 55: 0 0 0 0 1 0 0 0 0 1 0 0 0 0
## 56: 0 0 0 0 1 0 0 0 0 0 1 0 0 0
## 57: 0 0 0 0 1 0 0 0 0 0 0 1 0 0
## 58: 0 0 0 0 1 0 0 0 0 0 0 0 1 0
## 59: 0 0 0 0 1 0 0 0 0 0 0 0 0 1
## 60: 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 61: 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 62: 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 63: 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 64: 0 0 0 0 0 1 0 0 1 0 0 0 0 0
## 65: 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## 66: 0 0 0 0 0 1 0 0 0 0 1 0 0 0
## 67: 0 0 0 0 0 1 0 0 0 0 0 1 0 0
## 68: 0 0 0 0 0 1 0 0 0 0 0 0 1 0
## 69: 0 0 0 0 0 1 0 0 0 0 0 0 0 1
## 70: 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 71: 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 72: 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 73: 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## sorghum bmillet fmillet wheat beans potato jan feb mar apr may jun jul aug
## sep oct nov dec Region_GADM region_match
## 1: 0 0 0 0 Mbeya FALSE
## 2: 0 0 0 0 Mbeya FALSE
## 3: 0 0 0 0 Mbeya FALSE
## 4: 0 0 0 0 Mbeya FALSE
## 5: 0 0 0 0 Mbeya FALSE
## 6: 0 0 0 0 Mbeya FALSE
## 7: 1 0 0 0 Mbeya FALSE
## 8: 0 1 0 0 Mbeya FALSE
## 9: 0 0 1 0 Mbeya FALSE
## 10: 0 0 0 1 Mbeya FALSE
## 11: 0 0 0 0 Mbeya FALSE
## 12: 0 0 0 0 Mbeya FALSE
## 13: 0 0 0 0 Mbeya FALSE
## 14: 0 0 0 0 Mbeya FALSE
## 15: 0 0 0 0 Mbeya FALSE
## 16: 0 0 0 0 Mbeya FALSE
## 17: 1 0 0 0 Mbeya FALSE
## 18: 0 1 0 0 Mbeya FALSE
## 19: 0 0 1 0 Mbeya FALSE
## 20: 0 0 0 1 Mbeya FALSE
## 21: 0 0 0 0 Mbeya FALSE
## 22: 0 0 0 0 Mbeya FALSE
## 23: 0 0 0 0 Mbeya FALSE
## 24: 0 0 0 0 Mbeya FALSE
## 25: 0 0 0 0 Mbeya FALSE
## 26: 0 0 0 0 Mbeya FALSE
## 27: 1 0 0 0 Mbeya FALSE
## 28: 0 1 0 0 Mbeya FALSE
## 29: 0 0 1 0 Mbeya FALSE
## 30: 0 0 0 1 Mbeya FALSE
## 31: 0 1 0 0 Mbeya FALSE
## 32: 0 0 1 0 Mbeya FALSE
## 33: 0 0 0 1 Mbeya FALSE
## 34: 0 0 0 0 Mbeya FALSE
## 35: 0 0 0 0 Mbeya FALSE
## 36: 0 0 0 0 Mbeya FALSE
## 37: 0 0 0 0 Mbeya FALSE
## 38: 0 0 0 0 Mbeya FALSE
## 39: 0 0 0 0 Mbeya FALSE
## 40: 1 0 0 0 Mbeya FALSE
## 41: 0 1 0 0 Mbeya FALSE
## 42: 0 0 1 0 Mbeya FALSE
## 43: 0 0 0 1 Mbeya FALSE
## 44: 0 0 0 0 Mbeya FALSE
## 45: 0 0 0 0 Mbeya FALSE
## 46: 0 0 0 0 Mbeya FALSE
## 47: 0 0 0 0 Mbeya FALSE
## 48: 0 0 0 0 Mbeya FALSE
## 49: 0 0 0 0 Mbeya FALSE
## 50: 1 0 0 0 Mbeya FALSE
## 51: 0 1 0 0 Mbeya FALSE
## 52: 0 0 1 0 Mbeya FALSE
## 53: 0 0 0 1 Mbeya FALSE
## 54: 0 0 0 0 Mbeya FALSE
## 55: 0 0 0 0 Mbeya FALSE
## 56: 0 0 0 0 Mbeya FALSE
## 57: 0 0 0 0 Mbeya FALSE
## 58: 0 0 0 0 Mbeya FALSE
## 59: 0 0 0 0 Mbeya FALSE
## 60: 1 0 0 0 Mbeya FALSE
## 61: 0 1 0 0 Mbeya FALSE
## 62: 0 0 1 0 Mbeya FALSE
## 63: 0 0 0 1 Mbeya FALSE
## 64: 0 0 0 0 Mbeya FALSE
## 65: 0 0 0 0 Mbeya FALSE
## 66: 0 0 0 0 Mbeya FALSE
## 67: 0 0 0 0 Mbeya FALSE
## 68: 0 0 0 0 Mbeya FALSE
## 69: 0 0 0 0 Mbeya FALSE
## 70: 1 0 0 0 Mbeya FALSE
## 71: 0 1 0 0 Mbeya FALSE
## 72: 0 0 1 0 Mbeya FALSE
## 73: 0 0 0 1 Mbeya FALSE
## sep oct nov dec Region_GADM region_match
unique(mis[, c("Region", "Market", "Region_GADM")])
## Region Market Region_GADM
## 1: Songwe Songwe Mbeya
# Songwe is mismatched, rename correctly
prices.monthly.long[Region_GADM == "Mbeya" & Region == "Songwe", Region_GADM := "Songwe"]
unique(prices.monthly.long[, c("Region", "Market", "Region_GADM")])
## Region Market Region_GADM
## 1: Arusha Arusha Arusha
## 2: Dar es Salaam Temeke Dar es Salaam
## 3: Dodoma Majengo Dodoma
## 4: Dodoma Kibaigwa Dodoma
## 5: Kagera Bukoba Kagera
## 6: Manyara Babati Manyara
## 7: Rukwa Sumbawanga Rukwa
## 8: Katavi Mpanda Katavi
## 9: Mtwara Mtwara Mtwara
## 10: Tabora Tabora Tabora
## 11: Tanga Tanga Tanga
## 12: Dar es Salaam Kinondoni Dar es Salaam
## 13: Dar es Salaam Ilala Dar es Salaam
## 14: Iringa Iringa Iringa
## 15: Kigoma Kigoma Kigoma
## 16: Morogoro Morogoro Morogoro
## 17: Mwanza Mwanza Mwanza
## 18: Mara Musoma Mara
## 19: Ruvuma Songea Ruvuma
## 20: Shinyanga Shinyanga Shinyanga
## 21: Kilimanjaro Moshi Kilimanjaro
## 22: Mbeya Mbeya Mbeya
## 23: Njombe Njombe Njombe
## 24: Lindi Lindi Lindi
## 25: Manyara Manyara Manyara
## 26: Dar es Salaam Tandika Dar es Salaam
## 27: Dar es Salaam Buguruni Dar es Salaam
## 28: Dar es Salaam Tandale Dar es Salaam
## 29: Singida Singida Singida
## 30: Pwani Pwani Pwani
## 31: Simiyu Bariadi Simiyu
## 32: Katavi Mpimbwe Katavi
## 33: Geita Nyankumbu Geita
## 34: Songwe Songwe Songwe
## 35: Dar es Salaam Mwananyamala Dar es Salaam
## 36: Singida Namfua Singida
## 37: Arusha Kilombero Arusha
## 38: Tanga Mgandini Tanga
## 39: Katavi majengo Katavi
## 40: Mbeya Igawilo Mbeya
## 41: Pwani Mnalani Pwani
## 42: Katavi Majengo Katavi
## 43: Njombe Makambako Njombe
## 44: Dar es Salaam Ubungo Dar es Salaam
## Region Market Region_GADM
# Check if Region matches Region_GADM
prices.monthly.long$region_match <- prices.monthly.long$Region == prices.monthly.long$Region_GADM
table(prices.monthly.long$region_match)
##
## TRUE
## 7058
# convert prices observations to vector for mapping
mypts <- vect(prices.monthly.long, geom=c("Longitude", "Latitude"), crs=crs(tza0), keepgeom=TRUE)
mkt_pts <- unique(mypts[, c("Market", "Latitude", "Longitude")])
head(mkt_pts)
## Market Latitude Longitude
## 1 Arusha -3.36968 36.68808
## 2 Temeke -6.85097 39.25672
## 3 Majengo -6.17971 35.74109
## 4 Kibaigwa -6.08110 36.64645
## 5 Bukoba -1.33140 31.81293
## 6 Babati -4.21006 35.74915
# create reference raster
tza_extent <- ext(tza1) |> floor()
r <- crop(rast(res=1/12), tza_extent)
## Interpolate
#xy <- as.matrix(mypts[,c("Longitude", "Latitude")])
xy <- geom(mypts)[,c("y","x")]
#tps <- Tps(xy, p$spatial)
tps <- Tps(xy, mypts$pkg)
sp <- interpolate(r, tps)
sp <- mask(sp, tza1)
plot(sp)
lines(tza1)
## Predict Maize prices with coodinates only
maize_mypts <- mypts[mypts$Crop == "Maize", ]
rf <- randomForest(pkg ~ Longitude + Latitude , data=maize_mypts)
sp3 <- interpolate(r, rf, xyNames=c("Longitude", "Latitude"))
sp3 <- mask(sp3, tza1)
plot(sp3)
lines(tza1)
The covariates used include a mix of crop-specific indicators, temporal variables to capture monthly and yearly effects, geographical coordinates, accessibility measures, climatic conditions, and lagged rainfall to account for delayed effects of weather on crop prices.
ttcity <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_cities_u5.tif")
ttport <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/travel/travel_time_to_ports_1.tif")
clm <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/wc2.1_country/TZA_wc2.1_30s_bio.tif")
area <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_H_MAIZ_A.tif")
yield <- rast("H:/Tanzania Price data/Datasets/geodata/TRUE/spam/spam2017V2r1_SSA_Y_MAIZ_R.tif")
popd <- rast("gpw_v4_population_density_rev11_2020_10m.tif")
names(ttcity) <- c("ttcity_u5") ## travel time cities of 100k or more
names(ttport) <- c("ttport_1") ## travel time to major ports
names(clm) <- gsub("wc2.1_30s_", "", names(clm))
names(area) <- c("MAI_ARE") # SPAM maize area 2010
names(yield) <- c("MAI_YLD") # SPAM maize yield 2010
names(popd) <- c("popdens") # GPW4
comment(ttcity) <- "travel time to cities 100k or more"
comment(ttport) <- "travel time to major ports"
comment(popd) <- "population density 2020 (GPW4 @ 10dm)"
comment(clm)[1] <-"BIO1 = Annual Mean Temperature"
comment(clm)[2] <-"BIO2 = Mean Diurnal Range (Mean of monthly (max temp - min temp))"
comment(clm)[3] <-"BIO3 = Isothermality (BIO2/BIO7) (×100)"
comment(clm)[4] <-"BIO4 = Temperature Seasonality (standard deviation ×100)"
comment(clm)[5] <-"BIO5 = Max Temperature of Warmest Month"
comment(clm)[6] <-"BIO6 = Min Temperature of Coldest Month"
comment(clm)[7] <-"BIO7 = Temperature Annual Range (BIO5-BIO6)"
comment(clm)[8] <-"BIO8 = Mean Temperature of Wettest Quarter"
comment(clm)[9] <-"BIO9 = Mean Temperature of Driest Quarter"
comment(clm)[10] <-"BIO10 = Mean Temperature of Warmest Quarter"
comment(clm)[11] <-"BIO11 = Mean Temperature of Coldest Quarter"
comment(clm)[12] <-"BIO12 = Annual Precipitation"
comment(clm)[13] <-"BIO13 = Precipitation of Wettest Month"
comment(clm)[14] <-"BIO14 = Precipitation of Driest Month"
comment(clm)[15] <-"BIO15 = Precipitation Seasonality (Coefficient of Variation)"
comment(clm)[16] <-"BIO16 = Precipitation of Wettest Quarter"
comment(clm)[17] <-"BIO17 = Precipitation of Driest Quarter"
comment(clm)[18] <-"BIO18 = Precipitation of Warmest Quarter"
comment(clm)[19] <-"BIO19 = Precipitation of Coldest Quarter"
ttcity <- resample(ttcity, r)
ttport <- resample(ttport, r)
clm <- resample(clm, r)
area <- resample(area, r)
popd <- resample(popd, r)
freq(is.na(area))
## layer value count
## 1 1 0 14393
## 2 1 1 6343
area <- classify(area, cbind(NA,0))
yield <- resample(yield, r)
freq(is.na(yield))
## layer value count
## 1 1 0 14393
## 2 1 1 6343
yield <- classify(yield, cbind(NA,0))
# check again
compareGeom(ttcity, ttport, clm, area, yield, popd)
## [1] TRUE
latgrd <- longrd <- r
latgrd[] <- yFromCell(latgrd, 1:ncell(latgrd))
longrd[] <- xFromCell(longrd, 1:ncell(longrd))
names(latgrd) <- c("latitude")
names(longrd) <- c("longitude")
rstack <- c(ttcity, ttport, clm, area, yield, popd, latgrd, longrd)
names(rstack)
## [1] "ttcity_u5" "ttport_1" "bio_1" "bio_2" "bio_3" "bio_4"
## [7] "bio_5" "bio_6" "bio_7" "bio_8" "bio_9" "bio_10"
## [13] "bio_11" "bio_12" "bio_13" "bio_14" "bio_15" "bio_16"
## [19] "bio_17" "bio_18" "bio_19" "MAI_ARE" "MAI_YLD" "popdens"
## [25] "latitude" "longitude"
# create focal mean to extract from (as alternative to using buffers for extraction, which are not supported in terra)
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack, w=fm, fun="mean", na.policy="all", fillvalue=NA, # na.rm=TRUE,
expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE)
# extract values to dataset -- use a 20km buffer
# do a focal sum of 20km radius - this is about 0.18 of a decimal degree... 0.18*112=20.16
fm <- focalMat(r, d=0.18, type='circle', fillNA=FALSE)
rstack2 <- focal(rstack2, w=fm, fun="sum", na.policy="all", fillvalue=NA, na.rm=TRUE,
expand=TRUE, silent=FALSE) #, filename="", overwrite=FALSE)
chirps_path <- "H:/Tanzania Price data/chirps_data"
chirps_files <- list.files(chirps_path, pattern = ".tif$", full.names = TRUE)
# Read all CHIRPS data files into a SpatRaster collection
chirps_rasters <- rast(chirps_files)
#crop to Tanzania boundary
Chirps_Tz <- crop(chirps_rasters, tza1)
writeRaster(Chirps_Tz, "Tz_chirps_monthly_croped.tif", overwrite=TRUE)
Tz_chirps_monthly <- terra::rast("Tz_chirps_monthly_croped.tif")
Tz_chirps_monthly
## class : SpatRaster
## dimensions : 215, 222, 50 (nrow, ncol, nlyr)
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source : Tz_chirps_monthly_croped.tif
## names : chirp~20.11, chirp~20.12, chirp~21.01, chirp~21.02, chirp~21.03, chirp~21.04, ...
## min values : -9999.0000, -9999.0000, -9999.000, -9999.0000, -9999.0000, -9999.0000, ...
## max values : 459.7112, 504.3329, 508.448, 585.3241, 750.1949, 960.3376, ...
#Replace -9999 with NA
Tz_chirps_monthly <- classify(Tz_chirps_monthly, cbind(-9999,NA))
#extract layer names
layer_names <- names(Tz_chirps_monthly)
layer_names
## [1] "chirps-v2.0.2020.11" "chirps-v2.0.2020.12" "chirps-v2.0.2021.01"
## [4] "chirps-v2.0.2021.02" "chirps-v2.0.2021.03" "chirps-v2.0.2021.04"
## [7] "chirps-v2.0.2021.05" "chirps-v2.0.2021.06" "chirps-v2.0.2021.07"
## [10] "chirps-v2.0.2021.08" "chirps-v2.0.2021.09" "chirps-v2.0.2021.10"
## [13] "chirps-v2.0.2021.11" "chirps-v2.0.2021.12" "chirps-v2.0.2022.01"
## [16] "chirps-v2.0.2022.02" "chirps-v2.0.2022.03" "chirps-v2.0.2022.04"
## [19] "chirps-v2.0.2022.05" "chirps-v2.0.2022.06" "chirps-v2.0.2022.07"
## [22] "chirps-v2.0.2022.08" "chirps-v2.0.2022.09" "chirps-v2.0.2022.10"
## [25] "chirps-v2.0.2022.11" "chirps-v2.0.2022.12" "chirps-v2.0.2023.01"
## [28] "chirps-v2.0.2023.02" "chirps-v2.0.2023.03" "chirps-v2.0.2023.04"
## [31] "chirps-v2.0.2023.05" "chirps-v2.0.2023.06" "chirps-v2.0.2023.07"
## [34] "chirps-v2.0.2023.08" "chirps-v2.0.2023.09" "chirps-v2.0.2023.10"
## [37] "chirps-v2.0.2023.11" "chirps-v2.0.2023.12" "chirps-v2.0.2024.01"
## [40] "chirps-v2.0.2024.02" "chirps-v2.0.2024.03" "chirps-v2.0.2024.04"
## [43] "chirps-v2.0.2024.05" "chirps-v2.0.2024.06" "chirps-v2.0.2024.07"
## [46] "chirps-v2.0.2024.08" "chirps-v2.0.2024.09" "chirps-v2.0.2024.10"
## [49] "chirps-v2.0.2024.11" "chirps-v2.0.2024.12"
# We need to create a sequence of dates from the layer names
# Extract year and month from layer names and convert to Date
dates <- as.Date(paste0(sub("chirps-v2.0\\.", "", layer_names), "-01"), format = "%Y.%m-%d")
dates
## [1] "2020-11-01" "2020-12-01" "2021-01-01" "2021-02-01" "2021-03-01"
## [6] "2021-04-01" "2021-05-01" "2021-06-01" "2021-07-01" "2021-08-01"
## [11] "2021-09-01" "2021-10-01" "2021-11-01" "2021-12-01" "2022-01-01"
## [16] "2022-02-01" "2022-03-01" "2022-04-01" "2022-05-01" "2022-06-01"
## [21] "2022-07-01" "2022-08-01" "2022-09-01" "2022-10-01" "2022-11-01"
## [26] "2022-12-01" "2023-01-01" "2023-02-01" "2023-03-01" "2023-04-01"
## [31] "2023-05-01" "2023-06-01" "2023-07-01" "2023-08-01" "2023-09-01"
## [36] "2023-10-01" "2023-11-01" "2023-12-01" "2024-01-01" "2024-02-01"
## [41] "2024-03-01" "2024-04-01" "2024-05-01" "2024-06-01" "2024-07-01"
## [46] "2024-08-01" "2024-09-01" "2024-10-01" "2024-11-01" "2024-12-01"
# Assign these dates to the SpatRaster object
time(Tz_chirps_monthly) <- dates
#rename the layers to the formatted dates
names(Tz_chirps_monthly) <- dates
# Check the SpatRaster object
print(Tz_chirps_monthly)
## class : SpatRaster
## dimensions : 215, 222, 50 (nrow, ncol, nlyr)
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## varname : Tz_chirps_monthly_croped
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 5.859746, 1.903993, 0.8161677, 0.3187093, 0.9539621, 1.17571, ...
## max values : 459.711212, 504.332947, 508.4479980, 585.3240967, 750.1949463, 960.33765, ...
## time (days) : 2020-11-01 to 2024-12-01
# do a focal mean of 100km radius - this is about 0.9 of a decimal degree... 0.9009*112=100.9008
# Calculate the focal mean for each layer (month)
fm_r <- focalMat(Tz_chirps_monthly, d=0.9, type='circle', fillNA=FALSE)
Rainfall_focal_sum_100km <- focal(Tz_chirps_monthly, w=fm_r, fun="mean", na.policy="all", fillvalue=NA, na.rm=TRUE,
expand=TRUE, silent=FALSE)
# Check the result
Rainfall_focal_sum_100km
## class : SpatRaster
## dimensions : 215, 222, 50 (nrow, ncol, nlyr)
## resolution : 0.05, 0.05 (x, y)
## extent : 29.35, 40.45, -11.75, -1.000001 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## varname : Tz_chirps_monthly_croped
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 20.79649, 27.27837, 8.092316, 4.489223, 11.30184, 15.74836, ...
## max values : 313.14248, 326.88227, 416.590468, 395.422070, 464.67840, 569.81990, ...
## time (days) : 2020-11-01 to 2024-12-01
Rainfall <- Rainfall_focal_sum_100km
#Resample
Rainfall_res <- resample(Rainfall, r)
Rainfall_res
## class : SpatRaster
## dimensions : 144, 144, 50 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : 2020-11-01, 2020-12-01, 2021-01-01, 2021-02-01, 2021-03-01, 2021-04-01, ...
## min values : 20.89972, 32.9472, 8.116516, 4.527868, 11.83634, 15.90436, ...
## max values : 310.41034, 326.4339, 416.389160, 395.367279, 462.19266, 550.26270, ...
## time (days) : 2020-11-01 to 2024-12-01
Here we define function that calculates 6 month lag sum of rainfall for each month in our data. The output is raster datsets.
calculate_lagged_sum <- function(raster_stack, num_months = 6) {
# Get the time vector from the raster stack
time_vector <- time(raster_stack)
# Initialize list to store lagged sum rasters
lagged_sum_rasters <- vector("list", length(time_vector))
# Loop through each layer in the raster stack
for (i in seq_along(time_vector)) {
if (i > num_months) { # We need at least 'num_months' previous layers to calculate the lagged sum
# Determine the start and end dates for the lag period
end_date <- time_vector[i] # Date of the current layer being processed
start_date <- end_date %m-% months(num_months) #The date num_months before the end_date
# Select the layers that fall within the lag period
lag_period_layers <- raster_stack[[which(time_vector > start_date & time_vector <= end_date)]]
# Calculate the sum of the selected layers
if (nlyr(lag_period_layers) == num_months) {
lagged_sum_rasters[[i]] <- sum(lag_period_layers, na.rm = TRUE)
} else {
lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack),
crs = crs(raster_stack), ext = ext(raster_stack),
vals = NA) # Use an empty raster with NA values
}
} else {
lagged_sum_rasters[[i]] <- rast(nrow = nrow(raster_stack), ncol = ncol(raster_stack),
crs = crs(raster_stack), ext = ext(raster_stack),
vals = NA) # Use an empty raster with NA values
}
}
# Combine the lagged sum rasters into a single raster stack, excluding empty rasters
lagged_sum_stack <- rast(lagged_sum_rasters)
# Set the names for the layers in the lagged sum stack
names(lagged_sum_stack) <- names(raster_stack)[!is.na(lagged_sum_rasters)]
return(lagged_sum_stack)
}
# Calculate the 6-month lagged sum for each period in the raster stack
lagged_rainfall_sum <- calculate_lagged_sum(Rainfall_res, num_months = 6)
# Remove the first 6 layers from the raster stack since they are empty
lagged_rainfall_sum_filtered <- lagged_rainfall_sum[[7:nlyr(lagged_rainfall_sum)]]
# check the result
print(lagged_rainfall_sum_filtered)
## class : SpatRaster
## dimensions : 144, 144, 44 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : 2021-05-01, 2021-06-01, 2021-07-01, 2021-08-01, 2021-09-01, 2021-10-01, ...
## min values : 172.7856, 105.5447, 105.4121, 105.1864, 26.20571, 7.070483, ...
## max values : 1512.4689, 1355.9669, 1052.9220, 1051.9214, 1038.13874, 643.716021, ...
names(lagged_rainfall_sum_filtered) <- paste0(names(lagged_rainfall_sum_filtered), "_rain.sum.lag")
plot(lagged_rainfall_sum_filtered)
## We'll have to include lagged_rainfall_sum_filtered in the predictor stack.
rstack
## class : SpatRaster
## dimensions : 144, 144, 26 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source(s) : memory
## names : ttcity_u5, ttport_1, bio_1, bio_2, bio_3, bio_4, ...
## min values : 3.350081e-02, 997.9427, 4.333545, 6.546645, 53.85881, 19.75255, ...
## max values : 1.166560e+03, 3043.3459, 28.472235, 15.285786, 93.14099, 266.33594, ...
#names(rstack)
rstack3 <- c(rstack, lagged_rainfall_sum_filtered)
names(rstack3)
## [1] "ttcity_u5" "ttport_1"
## [3] "bio_1" "bio_2"
## [5] "bio_3" "bio_4"
## [7] "bio_5" "bio_6"
## [9] "bio_7" "bio_8"
## [11] "bio_9" "bio_10"
## [13] "bio_11" "bio_12"
## [15] "bio_13" "bio_14"
## [17] "bio_15" "bio_16"
## [19] "bio_17" "bio_18"
## [21] "bio_19" "MAI_ARE"
## [23] "MAI_YLD" "popdens"
## [25] "latitude" "longitude"
## [27] "2021-05-01_rain.sum.lag" "2021-06-01_rain.sum.lag"
## [29] "2021-07-01_rain.sum.lag" "2021-08-01_rain.sum.lag"
## [31] "2021-09-01_rain.sum.lag" "2021-10-01_rain.sum.lag"
## [33] "2021-11-01_rain.sum.lag" "2021-12-01_rain.sum.lag"
## [35] "2022-01-01_rain.sum.lag" "2022-02-01_rain.sum.lag"
## [37] "2022-03-01_rain.sum.lag" "2022-04-01_rain.sum.lag"
## [39] "2022-05-01_rain.sum.lag" "2022-06-01_rain.sum.lag"
## [41] "2022-07-01_rain.sum.lag" "2022-08-01_rain.sum.lag"
## [43] "2022-09-01_rain.sum.lag" "2022-10-01_rain.sum.lag"
## [45] "2022-11-01_rain.sum.lag" "2022-12-01_rain.sum.lag"
## [47] "2023-01-01_rain.sum.lag" "2023-02-01_rain.sum.lag"
## [49] "2023-03-01_rain.sum.lag" "2023-04-01_rain.sum.lag"
## [51] "2023-05-01_rain.sum.lag" "2023-06-01_rain.sum.lag"
## [53] "2023-07-01_rain.sum.lag" "2023-08-01_rain.sum.lag"
## [55] "2023-09-01_rain.sum.lag" "2023-10-01_rain.sum.lag"
## [57] "2023-11-01_rain.sum.lag" "2023-12-01_rain.sum.lag"
## [59] "2024-01-01_rain.sum.lag" "2024-02-01_rain.sum.lag"
## [61] "2024-03-01_rain.sum.lag" "2024-04-01_rain.sum.lag"
## [63] "2024-05-01_rain.sum.lag" "2024-06-01_rain.sum.lag"
## [65] "2024-07-01_rain.sum.lag" "2024-08-01_rain.sum.lag"
## [67] "2024-09-01_rain.sum.lag" "2024-10-01_rain.sum.lag"
## [69] "2024-11-01_rain.sum.lag" "2024-12-01_rain.sum.lag"
#Extract to the point dataset
extr1 <- terra::extract(rstack3, mypts, method = "bilinear")
mypts <- cbind(mypts, extr1)
# Remove the ID column from the dataset
mypts <- mypts[, !names(mypts) %in% "ID"]
Here we extract sum of lag rainfall for each row under the column rain.sum.lag
mypts_df <- as.data.frame(mypts)
# Define the function to obtain sum of lag rainfall from corresponding rasters to mypts under rain.sum.lag column (for each row)
# Each extraction has to match the month and year
get_rain_sum_row <- function(current_date, mypts_row) {
# Extract the rainfall value for the current date
rain_sum <- mypts_row[[paste0(current_date, "_rain.sum.lag")]]
return(rain_sum)
}
# Loop through each row and obtain the rainfall sum for each month and year
for (i in 1:nrow(mypts_df)) {
# Extract relevant data for the current row
month <- mypts_df$Month[i]
year <- mypts_df$Year[i]
current_date <- paste0(year, "-", sprintf("%02d", month), "-01") # Format date correctly
# Pass necessary data to the function
rain_sum <- get_rain_sum_row(current_date, mypts_df[i, ])
# Update the rain.sum.lag column
mypts_df$rain.sum.lag[i] <- rain_sum
}
# Update the SpatVector with the new rain.avg column
mypts$rain.sum.lag <- mypts_df$rain.sum.lag
# I'll drop the dates with rain.sum.lag from mypts, seems redundant
column_indices <- grep("^202[0-4]-", names(mypts))
mypts <- mypts[, -column_indices]
names(mypts)
## [1] "Region" "Market" "Month" "Year" "Latitude"
## [6] "Longitude" "Crop" "pkg" "maize" "rice"
## [11] "sorghum" "bmillet" "fmillet" "wheat" "beans"
## [16] "potato" "jan" "feb" "mar" "apr"
## [21] "may" "jun" "jul" "aug" "sep"
## [26] "oct" "nov" "dec" "Region_GADM" "region_match"
## [31] "ttcity_u5" "ttport_1" "bio_1" "bio_2" "bio_3"
## [36] "bio_4" "bio_5" "bio_6" "bio_7" "bio_8"
## [41] "bio_9" "bio_10" "bio_11" "bio_12" "bio_13"
## [46] "bio_14" "bio_15" "bio_16" "bio_17" "bio_18"
## [51] "bio_19" "MAI_ARE" "MAI_YLD" "popdens" "latitude"
## [56] "longitude" "rain.sum.lag"
#define Month as a factor
mypts$Month <- as.factor(mypts$Month)
levels(mypts$Month)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
#We'll define month as an interger instead.
# Check to make sure Month is interger
sapply(mypts, class)
## Region Market Month Year Latitude Longitude
## "character" "character" "factor" "integer" "numeric" "numeric"
## Crop pkg maize rice sorghum bmillet
## "factor" "numeric" "numeric" "numeric" "numeric" "numeric"
## fmillet wheat beans potato jan feb
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## mar apr may jun jul aug
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## sep oct nov dec Region_GADM region_match
## "numeric" "numeric" "numeric" "numeric" "character" "logical"
## ttcity_u5 ttport_1 bio_1 bio_2 bio_3 bio_4
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_5 bio_6 bio_7 bio_8 bio_9 bio_10
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_11 bio_12 bio_13 bio_14 bio_15 bio_16
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## bio_17 bio_18 bio_19 MAI_ARE MAI_YLD popdens
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## latitude longitude rain.sum.lag
## "numeric" "numeric" "numeric"
# drop levels that don't exist in Crop field
mypts$Crop <- mypts$Crop[,drop=TRUE]
levels(mypts$Crop)
## [1] "Maize" "Rice" "Sorghum" "B.Millet" "F.Millet" "Wheat" "Beans"
## [8] "Potato"
Coefficient estimates from the linear model provide a detailed insight into the relationship between each predictor and the response variable.
# Fit the linear model
lm_model <- lm(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttcity_u5 + ttport_1 +
bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +
bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13
+ bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 +
MAI_ARE + MAI_YLD +
Longitude + Latitude +
rain.sum.lag,
data = mypts)
summary(lm_model)
##
## Call:
## lm(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet +
## wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 +
## bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 +
## bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 +
## bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude +
## Latitude + rain.sum.lag, data = mypts)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1300.3 -255.4 -15.5 246.6 2338.6
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.108e+05 9.987e+03 -31.118 < 2e-16 ***
## maize -1.064e+02 1.752e+01 -6.073 1.32e-09 ***
## rice 1.326e+03 1.750e+01 75.793 < 2e-16 ***
## sorghum 3.764e+02 1.844e+01 20.413 < 2e-16 ***
## bmillet 4.788e+02 1.961e+01 24.419 < 2e-16 ***
## fmillet 7.767e+02 1.817e+01 42.755 < 2e-16 ***
## wheat 9.300e+02 1.982e+01 46.910 < 2e-16 ***
## beans 1.542e+03 1.752e+01 88.035 < 2e-16 ***
## potato NA NA NA NA
## Month2 3.231e+01 2.662e+01 1.214 0.224769
## Month3 1.520e+01 2.733e+01 0.556 0.578147
## Month4 2.216e+01 2.843e+01 0.779 0.435731
## Month5 -4.973e+00 2.677e+01 -0.186 0.852629
## Month6 -2.619e+01 2.531e+01 -1.035 0.300913
## Month7 -8.210e+01 2.469e+01 -3.325 0.000889 ***
## Month8 -9.831e+01 2.451e+01 -4.010 6.13e-05 ***
## Month9 -9.179e+01 2.508e+01 -3.660 0.000254 ***
## Month10 -8.076e+01 2.648e+01 -3.050 0.002295 **
## Month11 -2.946e+01 2.595e+01 -1.136 0.256158
## Month12 2.095e+01 2.451e+01 0.855 0.392613
## Year 1.471e+02 4.716e+00 31.200 < 2e-16 ***
## ttcity_u5 1.754e+00 1.544e-01 11.363 < 2e-16 ***
## ttport_1 -1.733e+00 1.688e-01 -10.268 < 2e-16 ***
## bio_1 -2.260e+03 1.834e+02 -12.328 < 2e-16 ***
## bio_2 -2.090e+03 1.853e+02 -11.275 < 2e-16 ***
## bio_3 1.900e+02 1.629e+01 11.668 < 2e-16 ***
## bio_4 1.276e+01 3.644e+00 3.502 0.000465 ***
## bio_5 1.588e+03 1.456e+02 10.907 < 2e-16 ***
## bio_6 -1.479e+03 1.457e+02 -10.154 < 2e-16 ***
## bio_7 NA NA NA NA
## bio_8 8.305e+02 8.056e+01 10.309 < 2e-16 ***
## bio_9 -7.776e+01 6.681e+01 -1.164 0.244520
## bio_10 -6.692e+02 1.732e+02 -3.863 0.000113 ***
## bio_11 2.101e+03 2.549e+02 8.241 < 2e-16 ***
## bio_12 1.122e+00 2.561e-01 4.379 1.21e-05 ***
## bio_13 9.079e-01 8.328e-01 1.090 0.275653
## bio_14 3.925e+01 1.083e+01 3.624 0.000292 ***
## bio_15 1.667e+01 1.952e+00 8.539 < 2e-16 ***
## bio_16 -1.489e+00 5.990e-01 -2.486 0.012926 *
## bio_17 -1.126e+01 3.015e+00 -3.736 0.000189 ***
## bio_18 -1.227e-01 2.161e-01 -0.568 0.570062
## bio_19 2.668e+00 2.330e+00 1.145 0.252291
## MAI_ARE 2.704e-02 2.328e-02 1.161 0.245511
## MAI_YLD 9.918e-02 1.268e-02 7.820 6.07e-15 ***
## Longitude 5.033e+01 2.634e+01 1.911 0.056087 .
## Latitude -3.612e+01 2.655e+01 -1.360 0.173764
## rain.sum.lag -1.612e-01 2.621e-02 -6.151 8.12e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 390.6 on 7013 degrees of freedom
## Multiple R-squared: 0.7194, Adjusted R-squared: 0.7177
## F-statistic: 408.7 on 44 and 7013 DF, p-value: < 2.2e-16
First check if there are any predictors with NA values
for(column in seq_along(mypts)){
if(any(is.na(mypts[column]))){
print(paste0("Column: ", colnames(mypts)[column], " has at least one NA value"))
}
}
#There are no columns with missing values
Data from May 2021 - Dec 2023 will be used for model training while more recent data from Jan - June 2024 will be used for Validation.
mypts <- as.data.frame(mypts)
# Filter the data for training (May 2021 - Dec 2023)
training_data <- mypts[mypts$Year %in% c(2021, 2022, 2023), ]
# Check training data
#head(training_data)
# Filter the data for validation (Jan 2024 - June 2024)
validation_data <- mypts[mypts$Year == 2024, ]
# Check validation data
#head(validation_data)
The tuneRF function in the randomForest package is used to tune the mtry parameter, which is the number of variables randomly sampled as candidates at each split in the random forest. The function requires a data frame of predictor variables and a response variable.
# Convert training_data data to data frame
mypts_df <- as.data.frame(training_data)
trf <- tuneRF(x=mypts_df[,1:ncol(mypts_df)], # Prediction variables
y=mypts_df$pkg) # Response variable
## mtry = 19 OOB error = 1822.457
## Searching left ...
## mtry = 10 OOB error = 7409.326
## -3.065571 0.05
## Searching right ...
## mtry = 38 OOB error = 181.6615
## 0.9003205 0.05
## mtry = 57 OOB error = 140.1176
## 0.2286886 0.05
Based on the output from tuneRF, you can observe that the mtry value that gives the lowest Out-of-Bag (OOB) error. To build the first random forest model, we will use this mtry value.
(mintree <- trf[which.min(trf[,2]),1])
## [1] 57
Here, the model is fitted using the randomForest function to build a predictive model for food commodity prices. The model takes the response variable, the prediction variables and the optimal number of variables to consider at each split. The goal is to generate Variable importance scores which will help us understand which variables have the most significant impact on the response variable, enabling us to interpret the model and possibly simplify it by focusing on the most important predictors.
# Create the random forest model
rf1 <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttcity_u5 + ttport_1 +
bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 +
bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 +
bio_13 + bio_14 + bio_15 + bio_16 + bio_17 +
bio_18 + bio_19 + MAI_ARE + MAI_YLD +
Longitude + Latitude +
rain.sum.lag,
data = training_data,mtry=mintree,
importance=TRUE,na.rm=TRUE)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
rf1
##
## Call:
## randomForest(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato + Month + Year + ttcity_u5 + ttport_1 + bio_1 + bio_2 + bio_3 + bio_4 + bio_5 + bio_6 + bio_7 + bio_8 + bio_9 + bio_10 + bio_11 + bio_12 + bio_13 + bio_14 + bio_15 + bio_16 + bio_17 + bio_18 + bio_19 + MAI_ARE + MAI_YLD + Longitude + Latitude + rain.sum.lag, data = training_data, mtry = mintree, importance = TRUE, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 36
##
## Mean of squared residuals: 32599
## % Var explained: 93.75
varImpPlot(rf1)
## evaluate
(oob <- sqrt(rf1$mse[which.min(rf1$mse)]))
## [1] 180.4739
This calculates the RMSE of the tree in the Random Forest model that has the lowest OOB error.
importance_metrics <- importance(rf1, type=1) # %IncMSE
impvar <- rownames(importance_metrics)[order(importance_metrics[, 1], decreasing=TRUE)]
# Get the top 20 variables
top_20_vars <- impvar[1:20]
top_20_vars
## [1] "Year" "Month" "rice" "beans" "Longitude"
## [6] "sorghum" "bmillet" "fmillet" "bio_12" "wheat"
## [11] "rain.sum.lag" "bio_18" "bio_15" "MAI_YLD" "Latitude"
## [16] "bio_3" "MAI_ARE" "ttport_1" "ttcity_u5" "bio_4"
node_purity <- importance(rf1, type=2) # IncNodePurity
# Sort variables by importance (IncNodePurity)
node_purity_sorted <- sort(node_purity[,1], decreasing = TRUE)
# Select the top 20 important variables
top_vars <- names(node_purity_sorted)[1:20]
print(top_vars)
## [1] "rice" "beans" "Year" "Month" "wheat"
## [6] "fmillet" "potato" "maize" "bio_12" "Longitude"
## [11] "sorghum" "bmillet" "rain.sum.lag" "bio_3" "MAI_YLD"
## [16] "bio_7" "bio_18" "ttcity_u5" "Latitude" "MAI_ARE"
rf1$importanceSD
## maize rice sorghum bmillet fmillet wheat
## 1913.5146 2855.2532 448.6135 602.5770 1739.7800 1644.9452
## beans potato Month Year ttcity_u5 ttport_1
## 3133.3086 1947.3786 283.2314 548.9603 250.6585 203.2343
## bio_1 bio_2 bio_3 bio_4 bio_5 bio_6
## 115.9734 228.3560 531.5079 164.3543 162.1760 263.5565
## bio_7 bio_8 bio_9 bio_10 bio_11 bio_12
## 644.4759 129.7121 230.1039 149.2550 125.0152 764.9873
## bio_13 bio_14 bio_15 bio_16 bio_17 bio_18
## 135.7562 128.3871 256.0899 154.1675 111.2669 251.7374
## bio_19 MAI_ARE MAI_YLD Longitude Latitude rain.sum.lag
## 182.5224 245.5736 474.0162 336.8559 253.7663 157.0253
In this section, we aim to refine our model by selecting the most important variables. We will review the importance metrics (%IncMSE and IncNodePurity) to identify the variables that contribute the most to the model’s predictive power. Our focus will be on variables with higher importance values to ensure a more efficient and interpretable model.
# Estimate more parsimonious specification
rf <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month +
Year +
ttport_1 +
ttcity_u5 +
popdens +
bio_3 + bio_6 + bio_9 + bio_12 + bio_18 +
rain.sum.lag,
data=training_data, na.rm=TRUE)
rf
##
## Call:
## randomForest(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato + Month + Year + ttport_1 + ttcity_u5 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + rain.sum.lag, data = training_data, na.rm = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 35964.83
## % Var explained: 93.1
# evaluate
varImpPlot(rf)
(oob <- sqrt(rf$mse[which.min(rf$mse)]))
## [1] 189.4732
partialPlot(rf, as.data.frame(training_data), "rain.sum.lag")
These are prediction plots for each food commodities (maize, beans, rice, sorghum, bmillet, fmillet, wheat, potato) with their respective month of the year 2023.
year <- 2024
Note: we must set the rain.sum.lag variables for each month we are predicting
#Maize
# Create an empty list to store predictions for maize
predict_for_month <- function(month){
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")] # Remember to change depending on year
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_maize <- data.frame(
maize = 1, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = factor(month, levels = levels(training_data$Month)), # match factor levels
Year = year
)
pred <- predict(newstack, rf, const = const_maize, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_maize <- lapply(1:12, predict_for_month)
# Extract pixel values from predictions_maize
maize_values <- unlist(lapply(predictions_maize, values))
# Get min and max values
min_maize <- min(maize_values, na.rm = TRUE)
max_maize <- max(maize_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- colorRampPalette(c("blue", "wheat", "red"))(100)
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 100
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 100
# Create a 3x4 matrix of plots
par(mar = c(0, 0, 0, 0)) # Set margins to 0 for inner plots
for (i in 1:12) {
plot(predictions_maize[[i]], main = paste("Maize prices", toupper(i), year),
zlim = c(min_maize, max_maize), col = color_palette, breaks = seq(min_maize, max_maize, by = break_interval), legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_maize, max_maize), n = 4)
# Reset plot layout for the legend
layout(matrix(1))
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_maize, max_maize), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Maize Price (TZS Per Kg)", side = 1, line = 2, cex = 0.9))
# Beans
# Function to predict beans for a given month
predict_for_beans <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_beans <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 1, potato = 0,
Month = factor(month, levels = levels(training_data$Month)), # match factor levels
Year = year
)
pred <- predict(newstack, rf, const = const_beans, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_beans <- lapply(1:12, predict_for_beans)
# Extract pixel values from predictions_beans
bean_values <- unlist(lapply(predictions_beans, values))
min_bean <- min(bean_values, na.rm = TRUE)
max_bean <- max(bean_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- colorRampPalette(c("blue", "wheat", "red"))(100)
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 150
# Loop through each month to plot beans prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_beans[[i]], main = paste("Beans prices", toupper(i), year),
zlim = c(min_bean, max_bean), col = color_palette, breaks = seq(min_bean, max_bean, by = break_interval), legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bean, max_bean), n = 4)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bean, max_bean), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Beans Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
# Rice
# Function to predict rice prices for a given month
predict_for_rice <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_rice <- data.frame(
maize = 0, rice = 1, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = factor(month, levels = levels(training_data$Month)), # match factor levels
Year = year
)
pred <- predict(newstack, rf, const = const_rice, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_rice <- lapply(1:12, predict_for_rice)
# Extract pixel values from predictions_rice
rice_values <- unlist(lapply(predictions_rice, values))
min_rice <- min(rice_values, na.rm = TRUE)
max_rice <- max(rice_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- colorRampPalette(c("blue", "wheat", "red"))(100)
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 150
# Loop through each month to plot rice prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_rice[[i]], main = paste("Rice prices", toupper(i), year),
zlim = c(min_rice, max_rice), col = color_palette, breaks = seq(min_rice, max_rice, by = break_interval), legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_rice, max_rice), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_rice, max_rice), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Rice Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
# Function to predict sorghum prices for a given month
predict_for_sorghum <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_sorghum <- data.frame(
maize = 0, rice = 0, sorghum = 1, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = factor(month, levels = levels(training_data$Month)), # match factor levels
Year = year
)
pred <- predict(newstack, rf, const = const_sorghum, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_sorghum <- lapply(1:12, predict_for_sorghum)
# Extract pixel values from predictions_sorghum
sorghum_values <- unlist(lapply(predictions_sorghum, values))
min_sorghum <- min(sorghum_values, na.rm = TRUE)
max_sorghum <- max(sorghum_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- colorRampPalette(c("blue", "wheat", "red"))(100)
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
break_interval <- 300
# Loop through each month to plot sorghum prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_sorghum[[i]], main = paste("Sorghum prices", toupper(i), year),
zlim = c(min_sorghum, max_sorghum), col = color_palette, breaks = seq(min_sorghum, max_sorghum, by = break_interval), legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_sorghum, max_sorghum), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_sorghum, max_sorghum), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Sorghum Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
# bmillet
# Function to predict bmillet prices for a given month
predict_for_bmillet <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_bmillet <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 1, fmillet = 0, wheat = 0, beans = 0, potato = 0,
Month = factor(month, levels = levels(training_data$Month)), # match factor levels
Year = year
)
pred <- predict(newstack, rf, const = const_bmillet, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_bmillet <- lapply(1:12, predict_for_bmillet)
# Extract pixel values from predictions_bmillet
bmillet_values <- unlist(lapply(predictions_bmillet, values))
min_bmillet <- min(bmillet_values, na.rm = TRUE)
max_bmillet <- max(bmillet_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- colorRampPalette(c("blue", "wheat", "red"))(100)
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot bmillet prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_bmillet[[i]], main = paste("Bmillet prices", toupper(i), year),
zlim = c(min_bmillet, max_bmillet), col = color_palette,
breaks = seq(min_bmillet, max_bmillet, by = break_interval),
legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_bmillet, max_bmillet), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_bmillet, max_bmillet), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Bulrush Millet Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
#fmillet
# Function to predict fmillet prices for a given month
predict_for_fmillet <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_fmillet <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 1, wheat = 0, beans = 0, potato = 0,
Month = factor(month, levels = levels(training_data$Month)), # match factor levels
Year = year
)
pred <- predict(newstack, rf, const = const_fmillet, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_fmillet <- lapply(1:12, predict_for_fmillet)
# Extract pixel values from predictions_fmillet
fmillet_values <- unlist(lapply(predictions_fmillet, values))
min_fmillet <- min(fmillet_values, na.rm = TRUE)
max_fmillet <- max(fmillet_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- colorRampPalette(c("blue", "wheat", "red"))(100)
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout with an extra row for the legend
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot fmillet prices
break_interval <- 300
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_fmillet[[i]], main = paste("Fmillet prices", toupper(i), year),
zlim = c(min_fmillet, max_fmillet), col = color_palette,
breaks = seq(min_fmillet, max_fmillet, by = break_interval),
legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_fmillet, max_fmillet), n = 3)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_fmillet, max_fmillet), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Finger Millet Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
#Wheat
# Function to predict wheat prices for a given month
predict_for_wheat <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_wheat <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 1, beans = 0, potato = 0,
Month = factor(month, levels = levels(training_data$Month)), # match factor levels
Year = year
)
pred <- predict(newstack, rf, const = const_wheat, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_wheat <- lapply(1:12, predict_for_wheat)
# Extract pixel values from predictions_wheat
wheat_values <- unlist(lapply(predictions_wheat, values))
min_wheat <- min(wheat_values, na.rm = TRUE)
max_wheat <- max(wheat_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- colorRampPalette(c("blue", "wheat", "red"))(100)
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Define the break interval for both plot and legend
break_interval <- 100
# Loop through each month to plot wheat prices
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_wheat[[i]], main = paste("Wheat prices", toupper(i), year),
zlim = c(min_wheat, max_wheat), col = color_palette,
breaks = seq(min_wheat, max_wheat, by = break_interval),
legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_wheat, max_wheat), n = 5)
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_wheat, max_wheat), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Wheat Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
#potatoes
# Function to predict potato prices for a given month
predict_for_potato <- function(month) {
rain_sum_lag <- rstack3[paste0("2024-", sprintf("%02d", month), "-01_rain.sum.lag")]
names(rain_sum_lag) <- "rain.sum.lag"
newstack <- c(rstack, rain_sum_lag)
const_potato <- data.frame(
maize = 0, rice = 0, sorghum = 0, bmillet = 0, fmillet = 0, wheat = 0, beans = 0, potato = 1,
Month = factor(month, levels = levels(training_data$Month)), # match factor levels
Year = year
)
pred <- predict(newstack, rf, const = const_potato, na.rm = TRUE)
pred <- mask(pred, tza1)
return(pred)
}
# Create predictions for all months
predictions_potato <- lapply(1:12, predict_for_potato)
# Extract pixel values from predictions_potato
potato_values <- unlist(lapply(predictions_potato, values))
min_potato <- min(potato_values, na.rm = TRUE)
max_potato <- max(potato_values, na.rm = TRUE)
# Define the continuous color palette and reverse it
color_palette <- colorRampPalette(c("blue", "wheat", "red"))(100)
# Set up plot layout with an extra row for the legend
layout_matrix <- matrix(c(1, 2, 3, 4,
5, 6, 7, 8,
9, 10, 11, 12,
13, 13, 13, 13), nrow = 4, byrow = TRUE)
# Set up layout
layout(layout_matrix, heights = c(1, 1, 1, 0.5))
# Loop through each month to plot potato prices
break_interval <- 150
par(mar = c(0, 0, 0, 0))
for (i in 1:12) {
plot(predictions_potato[[i]], main = paste("Potato prices", toupper(i), year),
zlim = c(min_potato, max_potato), col = color_palette,
breaks = seq(min_potato, max_potato, by = break_interval),
legend = FALSE, axes = FALSE)
# Add region boundaries
plot(tza1, add = TRUE, border = "black", lwd = 0.1)
points(training_data, pch = 20, col = "red", cex = 0.5)
}
# Generate pretty breaks for the legend
legend_breaks <- pretty(c(min_potato, max_potato), n = 5) # Adjust n as needed
# Reset plot layout to 1x1 for the legend
layout(matrix(1))
# Set margins for the legend
par(mar = c(5, 4, 2, 1))
# Plot the legend in the bottom row, centered horizontally
image.plot(zlim = c(min_potato, max_potato), legend.only = TRUE,
col = color_palette, horizontal = TRUE,
legend.width = 0.7, legend.shrink = 0.9,
axis.args = list(at = legend_breaks, labels = legend_breaks, cex.axis = 0.9),
legend.args = list(text = "Predicted Potato Price (Tsh) Per Kg", side = 1, line = 2, cex = 0.9))
pred<-predict(object=rf, newdata=validation_data)
actual<-validation_data$pkg
result<-data.frame(actual=actual, predicted=pred)
mse <- mean((actual - pred)^2, na.rm=TRUE)
paste('Mean Squared Error:', mse)
## [1] "Mean Squared Error: 165743.622471787"
rmse <- sqrt(mse)
paste('Root Mean Squared error: ',mean(sqrt(rf$mse)))
## [1] "Root Mean Squared error: 194.244666889019"
#Save predicted & observed price
write.csv(result, "result.csv")
#reading result.csv file (predicted vs observed)
rslt <- read.csv("result.csv", header=T)
print(names(rslt))
## [1] "X" "actual" "predicted"
#RMSE predicting from rf - predicited vs observed
rf.rmse<-round(sqrt(mean( (rslt$actual-rslt$predicted)^2 , na.rm = TRUE )),2)
print(rf.rmse)
## [1] 407.12
#R-square
rf.r2<-round(summary(lm(actual~predicted, rslt))$r.squared,2)
print(rf.r2)
## [1] 0.74
range(actual)
## [1] 350 4050
range(pred)
## [1] 640.6711 3243.3731
#plotting predicted Vs observed
ggplot(result, aes(x=actual, y=predicted), alpha=0.6) +
geom_point(colour = "blue", size = 1.4, alpha=0.6) +
ggtitle('Random Forest "Wholesale Grain Prices in Tanzania"') +
scale_x_continuous("Observed Price (Tsh) Per Kg",
limits = c(0, 5000),
breaks = seq(0, 5000, 1000)) +
scale_y_continuous("Predicted Price (Tsh) Per Kg",
limits = c(0, 5000),
breaks = seq(0, 5000, 1000)) +
theme(axis.line = element_line(colour = "black"),
axis.text.y = element_text(size = 8, angle = 90, hjust = 0.5, vjust = 1),
axis.text.x = element_text(size = 8)) +
geom_abline(intercept = 0, slope = 1, linewidth = 0.5) +
geom_smooth(aes(x = actual, y = predicted), formula = y ~ x, method = "lm", se = FALSE, colour = "red", linetype = 2, size = 0.9) +
annotate("text", x = 300, y = 4500, label = paste("RMSE:", rf.rmse)) +
annotate("text", x = 300, y = 4200, label = paste("R^2: ", rf.r2), parse = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(stats)
mypts_df$pred <- stats::predict(rf)
rsq <- function (obs, pred) cor(obs, pred, use = 'complete.obs') ^ 2
RMSE <- function(obs, pred){sqrt(mean((pred - obs)^2, na.rm = TRUE))}
fr2_rsq <- rsq(mypts_df$pkg, mypts_df$pred) %>% round(digits = 2)
fr2_rmse <- RMSE(mypts_df$pkg, mypts_df$pred) %>% round(digits = 0)
Price_fit_plot <- ggplot(data = mypts_df, aes(x = pkg, y = pred)) +
geom_point(colour = "blue", size = 1.4 ,alpha=0.6) +
ggtitle('Observed vs Predicted "Wholesale Grain Prices in Tanzania"') +
geom_abline(slope = 1, alpha=0.3) +
annotate('text', x = 150, y = 4500, label = paste0("R^{2}==", fr2_rsq), parse = TRUE, size=3) +
annotate('text', x = 150, y = 4200, label = paste0("RMSE==", fr2_rmse), parse = TRUE, size=3) +
labs(x = "Observed Price (Tsh) Per Kg", y = "Predicted Price (Tsh) Per Kg") +
xlim(0, 5000) + ylim(0, 5000)
Price_fit_plot
Plot Partial dependence of all the variables used except food commodities and months.
library(caret)
var_importance <- varImp(rf)
impvar <- rownames(var_importance)[order(var_importance[, 1], decreasing=TRUE)]
op <- par(mfrow=c(2, 4))
# exclude food commodities and months
predictors_to_plot <- setdiff(impvar, c("maize", "rice", "sorghum", "bmillet", "fmillet", "wheat", "beans", "potato", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"))
for (i in seq_along(predictors_to_plot)) {
partialPlot(rf, as.data.frame(training_data), predictors_to_plot[i], xlab=predictors_to_plot[i],
main="Partial Dependence")
}
# Compute descriptive statistics for each crop
stats <- mypts %>%
filter(pkg > 0) %>%
group_by(Crop) %>%
summarise(
Mean = mean(pkg, na.rm = TRUE),
Median = median(pkg, na.rm = TRUE),
Minimum = min(pkg, na.rm = TRUE),
Maximum = max(pkg, na.rm = TRUE),
Std_Dev = sd(pkg, na.rm = TRUE),
IQR = IQR(pkg, na.rm = TRUE),
Observations = n()
) %>%
arrange(Crop)
print(stats)
## # A tibble: 8 × 8
## Crop Mean Median Minimum Maximum Std_Dev IQR Observations
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Maize 770. 721. 268. 1722. 261. 414. 1006
## 2 Rice 2206. 2200 800 3518. 559. 886. 1010
## 3 Sorghum 1269. 1257. 420 3250 445. 650 838
## 4 B.Millet 1369. 1260 469. 4000 533. 774. 683
## 5 F.Millet 1655. 1700 700 3000 346. 450 877
## 6 Wheat 1797. 1800 368. 4050 551. 574. 654
## 7 Beans 2421. 2475 985 3850 558. 926. 1006
## 8 Potato 881. 853. 300 2000 261. 258. 984
These are correlation plots for pooled sample in two periods: post-harvest (May-Oct) and lean season (Nov-April).
#We'll use mean Monthly data in wide format
prices.monthly
## Region Market Month Year Latitude Longitude mai.price
## 1: Arusha Arusha 5 2021 -3.36968 36.68808 469.00
## 2: Dar es Salaam Temeke 5 2021 -6.85097 39.25672 485.00
## 3: Dodoma Majengo 5 2021 -6.17971 35.74109 501.50
## 4: Dodoma Kibaigwa 5 2021 -6.08110 36.64645 375.00
## 5: Kagera Bukoba 5 2021 -1.33140 31.81293 426.25
## ---
## 1012: Pwani Mnalani 12 2024 -6.44221 38.90622 1300.00
## 1013: Simiyu Bariadi 12 2024 -2.80355 33.98699 720.00
## 1014: Kigoma Kigoma 12 2024 -4.89697 29.65987 722.00
## 1015: Njombe Makambako 12 2024 -8.84398 34.82299 569.50
## 1016: Rukwa Sumbawanga 12 2024 -7.95239 31.62319 676.25
## ric.price sor.price bul.price fin.price whe.price bea.price pot.price
## 1: 1530 695.0 761.0 1333.0 793.00 1545.0 745
## 2: 1650 900.0 900.0 1775.0 1230.00 2420.0 730
## 3: 1592 558.0 581.0 1752.0 NaN 2075.0 618
## 4: NaN 437.5 NaN NaN NaN NaN NaN
## 5: 1260 1375.0 1337.5 1637.5 1637.50 1300.0 675
## ---
## 1012: 1950 1700.0 1600.0 2300.0 2200.00 3250.0 1500
## 1013: 1500 1650.0 1900.0 2300.0 2500.00 2950.0 1450
## 1014: 1600 900.0 1600.0 2300.0 1750.00 2700.0 1000
## 1015: 2375 850.0 NaN 1625.0 1300.00 2550.0 590
## 1016: 1950 NaN NaN 975.0 843.75 2162.5 725
# Create a function to determine the season
get_season <- function(month) {
if (month %in% 5:10) {
return("Post-Harvest")
} else {
return("Lean Season")
}
}
# Add a 'Season' column to the data
prices.monthly$Season <- sapply(prices.monthly$Month, get_season)
# Post Harvest Data
post_harvest_data <- prices.monthly[prices.monthly$Season == "Post-Harvest", ]
# Remove rows with NaNs from the Post-Harvest data
post_harvest_data <- post_harvest_data[complete.cases(post_harvest_data[, 7:14]), ]
# Lean Season data
lean_season_data <- prices.monthly[prices.monthly$Season == "Lean Season", ]
# Remove rows with NaNs from the Lean Season data
lean_season_data <- lean_season_data[complete.cases(lean_season_data[, 7:14]), ]
# Calculate correlation matrix for Post-Harvest season
post_harvest_corr <- cor(post_harvest_data[, 7:14])
# Calculate correlation matrix for Lean Season
lean_season_corr <- cor(lean_season_data[, 7:14])
# Plot correlation matrix for Post-Harvest season
corrplot(post_harvest_corr,
method = "color",
title = "",
tl.col = "black",
tl.cex = 0.5,
addCoef.col = "black",
number.cex = 0.5,
number.digits = 2)
# Add title to the plot
title(main = "Post-Harvest Correlation Matrix",
line = 3,
cex.main = 0.9)
# Plot correlation matrix for Lean Season
corrplot(lean_season_corr,
method = "color",
title = "",
tl.col = "black",
tl.cex = 0.5,
addCoef.col = "black",
number.cex = 0.5,
number.digits = 2)
# Add title to the plot
title(main = "Lean Season Correlation Matrix",
line = 3,
cex.main = 0.9)
We are comparing pooled vs Crop Specific Price Prediction models to determine which model performs better at prediction. This is done by comparing their prediction’s r2 or rmse respectively.
Here the dataset is split once (70:30). The model is trained on one part of the data (70%) and evaluated on the remaining set of the data (30%).
# Pooled RF model
##Split the data into train and test datasets
set.seed(1983)
mypts <- as.data.frame(mypts)
rows <- sample(x=1:nrow(mypts), size = 0.70* nrow(mypts))
train <- mypts[rows, ]
test <- mypts[! rownames(mypts) %in% rows, ]
# Hyperparameter tuning
x_vars1 <- train %>%
dplyr::select(-pkg)
y_var1 <- train$pkg
trf1 <- tuneRF(x = x_vars1,
y = y_var1,
stepFactor = 1.5,
improve = 0.01,
ntreeTry = 500,
trace = TRUE,
plot = TRUE)
## mtry = 18 OOB error = 51080.12
## Searching left ...
## mtry = 12 OOB error = 59798.62
## -0.1706828 0.01
## Searching right ...
## mtry = 27 OOB error = 46624.39
## 0.0872302 0.01
## mtry = 40 OOB error = 44530.98
## 0.04489964 0.01
## mtry = 56 OOB error = 44689.83
## -0.003567181 0.01
# number of valid predictors (excluding the target variable)
n_predictors1 <- length(setdiff(names(train), c("pkg")))
n_predictors1
## [1] 56
# Extract the mtry value that gave the lowest OOB error
best_mtry1 <- trf1[which.min(trf1[, 2]), 1]
best_mtry1
## [1] 40
# Ensure best_mtry is within valid range
best_mtry1 <- min(best_mtry1, n_predictors1)
best_mtry1
## [1] 40
RF <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans +
Month + Year +
#latitude + longitude +
ttport_1 + ttcity_u5 + popdens +
bio_3 + bio_6 + bio_9 + bio_12 + bio_18 +
rain.sum.lag,
data = train,
mtry = best_mtry1,
ntree = 500,
importance = TRUE,
na.action = na.omit)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
RF
##
## Call:
## randomForest(formula = pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + Month + Year + ttport_1 + ttcity_u5 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + rain.sum.lag, data = train, mtry = best_mtry1, ntree = 500, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 18
##
## Mean of squared residuals: 48143.99
## % Var explained: 90.93
evaluate_models <- function(crop) {
# Filter data for the specific crop
crop_data <- mypts[mypts$Crop == crop, ]
set.seed(1983)
rownames(crop_data)<-1:nrow(crop_data)
rows <- sample(x=1:nrow(crop_data), size = 0.70* nrow(crop_data))
training_data_crop <- crop_data[rows, ]
validation_data_crop <- crop_data[! rownames(crop_data) %in% rows, ]
# Predictions for pooled model on specific crop data
pooled_pred_crop <- predict(RF, newdata = validation_data_crop)
actual_pooled <- validation_data_crop$pkg
result_pooled <-data.frame(actual=actual_pooled, predicted=pooled_pred_crop)
# Crop-Specific Model
rf_crop <- randomForest(pkg ~ Month + Year +
#latitude + longitude +
ttport_1 + ttcity_u5 + popdens +
bio_3 + bio_6 + bio_9 + bio_12 + bio_18 +
rain.sum.lag,
data = training_data_crop,
mtry = best_mtry1,
ntree = 500,
importance = TRUE,
na.action = na.omit)
# Predictions for crop-specific model
predictions_crop <- predict(rf_crop, newdata = validation_data_crop)
actual_crop <- validation_data_crop$pkg
result_crop_specific <- data.frame(actual=actual_crop, predicted=predictions_crop)
# Calculate performance metrics
rmse_pooled <- round(sqrt(mean((result_pooled$actual-result_pooled$predicted)^2 , na.rm = TRUE )),2)
r2_pooled <- round(summary(lm(actual~predicted, result_pooled))$r.squared,2)
rmse_crop <- round(sqrt(mean((result_crop_specific$actual-result_crop_specific$predicted)^2 , na.rm = TRUE )),2)
r2_crop <- round(summary(lm(actual~predicted, result_crop_specific))$r.squared,2)
return(data.frame(Crop = crop,
Model = c("Pooled", "Crop-Specific"),
RMSE = c(rmse_pooled, rmse_crop),
R_squared = c(r2_pooled, r2_crop)))
}
# Apply function to all crops
crop <- unique(mypts$Crop)
comparison_df3 <- do.call(rbind, lapply(crop, evaluate_models))
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
comparison_df3
## Crop Model RMSE R_squared
## 1 Maize Pooled 67.70 0.94
## 2 Maize Crop-Specific 98.95 0.87
## 3 Rice Pooled 116.48 0.96
## 4 Rice Crop-Specific 193.17 0.89
## 5 Sorghum Pooled 152.76 0.90
## 6 Sorghum Crop-Specific 241.93 0.73
## 7 B.Millet Pooled 218.09 0.85
## 8 B.Millet Crop-Specific 324.53 0.64
## 9 F.Millet Pooled 160.53 0.77
## 10 F.Millet Crop-Specific 226.58 0.54
## 11 Wheat Pooled 208.57 0.88
## 12 Wheat Crop-Specific 330.04 0.67
## 13 Beans Pooled 152.76 0.93
## 14 Beans Crop-Specific 210.43 0.86
## 15 Potato Pooled 88.54 0.89
## 16 Potato Crop-Specific 145.04 0.70
comparison_df_wide3 <- comparison_df3 %>%
pivot_wider(names_from = Model, values_from = c(RMSE, R_squared)) %>%
rename(RMSE_Crop_Specific = `RMSE_Crop-Specific`,
RMSE_Pooled = `RMSE_Pooled`,
R_squared_Crop_Specific = `R_squared_Crop-Specific`,
R_squared_Pooled = `R_squared_Pooled`)
knitr::kable(comparison_df_wide3, caption = "Model Comparison: R² and RMSE Based on Train-Test Split Validation")
Crop | RMSE_Pooled | RMSE_Crop_Specific | R_squared_Pooled | R_squared_Crop_Specific |
---|---|---|---|---|
Maize | 67.70 | 98.95 | 0.94 | 0.87 |
Rice | 116.48 | 193.17 | 0.96 | 0.89 |
Sorghum | 152.76 | 241.93 | 0.90 | 0.73 |
B.Millet | 218.09 | 324.53 | 0.85 | 0.64 |
F.Millet | 160.53 | 226.58 | 0.77 | 0.54 |
Wheat | 208.57 | 330.04 | 0.88 | 0.67 |
Beans | 152.76 | 210.43 | 0.93 | 0.86 |
Potato | 88.54 | 145.04 | 0.89 | 0.70 |
write.csv(comparison_df_wide3, "H:/Tanzania Price data/Datasets/figures_and_maps/Train-Test-Validation-results.csv", row.names = FALSE)
Here we are using 2024 data for validation.
# Comparison between Pooled model and Crop Specific Model
set.seed(1983)
evaluate_models <- function(crop) {
# Filter data for the specific crop
crop_data <- mypts[mypts$Crop == crop, ]
training_data_crop <- crop_data[crop_data$Year %in% c(2021, 2022, 2023), ]
training_data_crop <- as.data.frame(training_data_crop)
validation_data_crop <- crop_data[crop_data$Year == 2024, ]
validation_data_crop <- as.data.frame(validation_data_crop)
# Predictions for pooled model on specific crop data
pooled_pred_crop <- predict(rf, newdata = validation_data_crop)
actual_pooled <- validation_data_crop$pkg
result_pooled <-data.frame(actual=actual_pooled, predicted=pooled_pred_crop)
# Crop-Specific RF Model
rf_crop <- randomForest(pkg ~ Month + Year + ttport_1 + ttcity_u5 + popdens + bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + rain.sum.lag,
data = training_data_crop, na.rm = TRUE)
# Predictions for crop-specific model
predictions_crop <- predict(rf_crop, newdata = validation_data_crop)
actual_crop <- validation_data_crop$pkg
result_crop_specific <- data.frame(actual=actual_crop, predicted=predictions_crop)
# Calculate performance metrics
rmse_pooled <- round(sqrt(mean((result_pooled$actual-result_pooled$predicted)^2 , na.rm = TRUE )),2)
r2_pooled <- round(summary(lm(actual~predicted, result_pooled))$r.squared,2)
rmse_crop <- round(sqrt(mean((result_crop_specific$actual-result_crop_specific$predicted)^2 , na.rm = TRUE )),2)
r2_crop <- round(summary(lm(actual~predicted, result_crop_specific))$r.squared,2)
return(data.frame(Crop = crop,
Model = c("Pooled", "Crop-Specific"),
RMSE = c(rmse_pooled, rmse_crop),
R_squared = c(r2_pooled, r2_crop)))
}
# Apply the function to all crops
crop <- unique(mypts$Crop)
set.seed(1983)
comparison_df <- do.call(rbind, lapply(crop, evaluate_models))
comparison_df
## Crop Model RMSE R_squared
## 1 Maize Pooled 333.35 0.31
## 2 Maize Crop-Specific 312.19 0.30
## 3 Rice Pooled 574.12 0.27
## 4 Rice Crop-Specific 554.00 0.25
## 5 Sorghum Pooled 321.53 0.52
## 6 Sorghum Crop-Specific 322.10 0.52
## 7 B.Millet Pooled 476.59 0.34
## 8 B.Millet Crop-Specific 472.71 0.35
## 9 F.Millet Pooled 323.27 0.29
## 10 F.Millet Crop-Specific 322.83 0.28
## 11 Wheat Pooled 493.90 0.39
## 12 Wheat Crop-Specific 507.50 0.34
## 13 Beans Pooled 348.09 0.34
## 14 Beans Crop-Specific 344.75 0.35
## 15 Potato Pooled 322.90 0.00
## 16 Potato Crop-Specific 321.77 0.00
#write.csv(comparison_df, "model_comparison_df.csv")
#comparison_df <- read.csv("model_comparison_df.csv")
comparison_df_wide <- comparison_df %>%
pivot_wider(names_from = Model, values_from = c(RMSE, R_squared)) %>%
rename(
RMSE_Crop_Specific = `RMSE_Crop-Specific`,
RMSE_Pooled = `RMSE_Pooled`,
R_squared_Crop_Specific = `R_squared_Crop-Specific`,
R_squared_Pooled = `R_squared_Pooled`
) %>%
group_by(Crop) %>%
summarize(
RMSE_Pooled = max(RMSE_Pooled, na.rm = TRUE),
RMSE_Crop_Specific = max(RMSE_Crop_Specific, na.rm = TRUE),
R_squared_Pooled = max(R_squared_Pooled, na.rm = TRUE),
R_squared_Crop_Specific = max(R_squared_Crop_Specific, na.rm = TRUE)
)
knitr::kable(comparison_df_wide, caption = "Model Comparison: R² and RMSE")
Crop | RMSE_Pooled | RMSE_Crop_Specific | R_squared_Pooled | R_squared_Crop_Specific |
---|---|---|---|---|
Maize | 333.35 | 312.19 | 0.31 | 0.30 |
Rice | 574.12 | 554.00 | 0.27 | 0.25 |
Sorghum | 321.53 | 322.10 | 0.52 | 0.52 |
B.Millet | 476.59 | 472.71 | 0.34 | 0.35 |
F.Millet | 323.27 | 322.83 | 0.29 | 0.28 |
Wheat | 493.90 | 507.50 | 0.39 | 0.34 |
Beans | 348.09 | 344.75 | 0.34 | 0.35 |
Potato | 322.90 | 321.77 | 0.00 | 0.00 |
write.csv(comparison_df_wide, "H:/Tanzania Price data/Datasets/figures_and_maps/Temporal-split-results.csv", row.names = FALSE)
This approach involves incorporating lagged Prices as predictors to capture temporal dependencies in the data.
mypts <- mypts %>%
arrange(Crop, Market, Year, Month) %>%
group_by(Crop, Market) %>%
mutate(pkg_lag1 = lag(pkg, 1), # 1-month lag
pkg_lag2 = lag(pkg, 2), # 2-month lag
pkg_lag3 = lag(pkg, 3)) %>% # 3-month lag
ungroup()
mypts <- na.omit(mypts)
# Filter the data for training (May 2021 - Dec 2023)
training_data <- mypts[mypts$Year %in% c(2021, 2022, 2023), ]
# Check training data
#head(training_data)
# Filter the data for validation (Jan 2024 - June 2024)
validation_data <- mypts[mypts$Year == 2024, ]
# Check validation data
#head(validation_data)
rf_ar <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans + potato +
Month + Year +
ttport_1 + ttcity_u5 + popdens +
bio_3 + bio_6 + bio_9 + bio_12 + bio_18 +
rain.sum.lag +
pkg_lag1 + pkg_lag2 + pkg_lag3, # Adding lagged prices
data=training_data, na.rm=TRUE)
varImpPlot(rf_ar)
evaluate_models_ar <- function(crop) {
crop_data <- mypts[mypts$Crop == crop, ]
training_data_crop <- crop_data[crop_data$Year %in% c(2021, 2022, 2023), ]
validation_data_crop <- crop_data[crop_data$Year == 2024, ]
# Predictions for pooled model with lags
pooled_pred_crop <- predict(rf_ar, newdata = validation_data_crop)
actual_pooled <- validation_data_crop$pkg
result_pooled <- data.frame(actual=actual_pooled, predicted=pooled_pred_crop)
# Train crop-specific model with lags
rf_crop_ar <- randomForest(pkg ~ Month + Year + ttport_1 + ttcity_u5 + popdens +
bio_3 + bio_6 + bio_9 + bio_12 + bio_18 + rain.sum.lag +
pkg_lag1 + pkg_lag2 + pkg_lag3, # Adding lagged prices
data = training_data_crop, na.rm = TRUE)
# Predictions for crop-specific model with lags
predictions_crop <- predict(rf_crop_ar, newdata = validation_data_crop)
actual_crop <- validation_data_crop$pkg
result_crop_specific <- data.frame(actual=actual_crop, predicted=predictions_crop)
# Calculate performance metrics
rmse_pooled <- round(sqrt(mean((result_pooled$actual - result_pooled$predicted)^2, na.rm = TRUE)), 2)
r2_pooled <- round(summary(lm(actual ~ predicted, result_pooled))$r.squared, 2)
rmse_crop <- round(sqrt(mean((result_crop_specific$actual - result_crop_specific$predicted)^2, na.rm = TRUE)), 2)
r2_crop <- round(summary(lm(actual ~ predicted, result_crop_specific))$r.squared, 2)
return(data.frame(Crop = crop,
Model = c("Pooled_AR", "Crop-Specific_AR"),
RMSE = c(rmse_pooled, rmse_crop),
R_squared = c(r2_pooled, r2_crop)))
}
# Run model evaluation for all crops
set.seed(1983)
comparison_df_ar <- do.call(rbind, lapply(unique(mypts$Crop), evaluate_models_ar))
print(comparison_df_ar)
## Crop Model RMSE R_squared
## 1 Maize Pooled_AR 151.62 0.60
## 2 Maize Crop-Specific_AR 140.22 0.65
## 3 Rice Pooled_AR 240.38 0.78
## 4 Rice Crop-Specific_AR 292.02 0.76
## 5 Sorghum Pooled_AR 231.97 0.74
## 6 Sorghum Crop-Specific_AR 246.03 0.74
## 7 B.Millet Pooled_AR 286.63 0.77
## 8 B.Millet Crop-Specific_AR 357.36 0.67
## 9 F.Millet Pooled_AR 192.20 0.72
## 10 F.Millet Crop-Specific_AR 203.13 0.69
## 11 Wheat Pooled_AR 276.14 0.82
## 12 Wheat Crop-Specific_AR 334.23 0.76
## 13 Beans Pooled_AR 219.38 0.70
## 14 Beans Crop-Specific_AR 224.78 0.70
## 15 Potato Pooled_AR 140.16 0.73
## 16 Potato Crop-Specific_AR 189.91 0.61
#comparison_df <- read.csv("model_comparison_df.csv")
comparison_df_wide_ar <- comparison_df_ar %>%
pivot_wider(names_from = Model, values_from = c(RMSE, R_squared)) %>%
rename(
RMSE_Crop_Specific = `RMSE_Crop-Specific_AR`,
RMSE_Pooled = `RMSE_Pooled_AR`,
R_squared_Crop_Specific = `R_squared_Crop-Specific_AR`,
R_squared_Pooled = `R_squared_Pooled_AR`
) %>%
group_by(Crop) %>%
summarize(
RMSE_Pooled = max(RMSE_Pooled, na.rm = TRUE),
RMSE_Crop_Specific = max(RMSE_Crop_Specific, na.rm = TRUE),
R_squared_Pooled = max(R_squared_Pooled, na.rm = TRUE),
R_squared_Crop_Specific = max(R_squared_Crop_Specific, na.rm = TRUE)
)
knitr::kable(comparison_df_wide_ar, caption = "Model Comparison: R² and RMSE for Autoregressive Random Forest")
Crop | RMSE_Pooled | RMSE_Crop_Specific | R_squared_Pooled | R_squared_Crop_Specific |
---|---|---|---|---|
Maize | 151.62 | 140.22 | 0.60 | 0.65 |
Rice | 240.38 | 292.02 | 0.78 | 0.76 |
Sorghum | 231.97 | 246.03 | 0.74 | 0.74 |
B.Millet | 286.63 | 357.36 | 0.77 | 0.67 |
F.Millet | 192.20 | 203.13 | 0.72 | 0.69 |
Wheat | 276.14 | 334.23 | 0.82 | 0.76 |
Beans | 219.38 | 224.78 | 0.70 | 0.70 |
Potato | 140.16 | 189.91 | 0.73 | 0.61 |
write.csv(comparison_df_wide_ar, "H:/Tanzania Price data/Datasets/figures_and_maps/Temporal-split-results_AR.csv", row.names = FALSE)
This analysis evaluates the spatial generalizability of a random forest model for predicting crop prices (pkg) across 44 markets. Using a leave-N-markets-out cross-validation approach, we assess how model performance (measured by R²) changes as we incrementally increase the number of markets held out from model training.
For each number of held-out markets (num_holdout from 1 to 43), the process is repeated 150 times with different random market combinations. A random forest is trained on the remaining markets and evaluated on the held-out ones. Average and standard deviation of R² values across repetitions are computed for each holdout size.
A segmented (piecewise) regression is then fitted to the resulting data to visualize trends in prediction accuracy as market coverage declines.
registerDoParallel(cores = parallel::detectCores() - 1)
set.seed(1983)
unique_markets <- unique(mypts$Market)
total_markets <- length(unique_markets)
market_holdout_results <- list()
for (num_holdout in 1:(total_markets - 1)) {
r2_vals <- foreach(rep = 1:150, .combine = c, .packages = c("randomForest", "tidyverse")) %dopar% {
heldout_markets <- sample(unique_markets, num_holdout)
train_subset <- mypts %>% filter(!Market %in% heldout_markets)
test_subset <- mypts %>% filter(Market %in% heldout_markets)
model_rf <- randomForest(pkg ~ maize + rice + sorghum + bmillet + fmillet + wheat + beans +
Month + Year +
latitude + longitude +
ttport_1 + ttcity_u5 + popdens +
bio_3 + bio_6 + bio_9 + bio_12 + bio_18 +
rain.sum.lag,
data = train_subset,
ntree = 500,
importance = TRUE,
na.action = na.omit)
preds <- predict(model_rf, newdata = test_subset)
actual_vals <- test_subset$pkg
if (length(unique(actual_vals)) > 1) {
return(summary(lm(actual_vals ~ preds))$r.squared)
} else {
return(NA_real_)
}
}
market_holdout_results[[num_holdout]] <- data.frame(
Markets_Heldout = num_holdout,
Avg_R2 = mean(r2_vals, na.rm = TRUE),
SD_R2 = sd(r2_vals, na.rm = TRUE)
)
cat("Done with num_holdout =", num_holdout, "\n")
}
## Done with num_holdout = 1
## Done with num_holdout = 2
## Done with num_holdout = 3
## Done with num_holdout = 4
## Done with num_holdout = 5
## Done with num_holdout = 6
## Done with num_holdout = 7
## Done with num_holdout = 8
## Done with num_holdout = 9
## Done with num_holdout = 10
## Done with num_holdout = 11
## Done with num_holdout = 12
## Done with num_holdout = 13
## Done with num_holdout = 14
## Done with num_holdout = 15
## Done with num_holdout = 16
## Done with num_holdout = 17
## Done with num_holdout = 18
## Done with num_holdout = 19
## Done with num_holdout = 20
## Done with num_holdout = 21
## Done with num_holdout = 22
## Done with num_holdout = 23
## Done with num_holdout = 24
## Done with num_holdout = 25
## Done with num_holdout = 26
## Done with num_holdout = 27
## Done with num_holdout = 28
## Done with num_holdout = 29
## Done with num_holdout = 30
## Done with num_holdout = 31
## Done with num_holdout = 32
## Done with num_holdout = 33
## Done with num_holdout = 34
## Done with num_holdout = 35
## Done with num_holdout = 36
## Done with num_holdout = 37
## Done with num_holdout = 38
## Done with num_holdout = 39
## Done with num_holdout = 40
market_holdout_df <- bind_rows(market_holdout_results)
market_holdout_df
## Markets_Heldout Avg_R2 SD_R2
## 1 1 0.8259799 0.11637608
## 2 2 0.7995875 0.09807273
## 3 3 0.7678180 0.08388532
## 4 4 0.7680966 0.08177494
## 5 5 0.7662437 0.07282222
## 6 6 0.7627990 0.06330309
## 7 7 0.7597887 0.06289671
## 8 8 0.7552103 0.05704274
## 9 9 0.7512195 0.05219986
## 10 10 0.7642668 0.05028724
## 11 11 0.7426681 0.04645879
## 12 12 0.7467461 0.04788667
## 13 13 0.7459298 0.04550725
## 14 14 0.7441888 0.03770207
## 15 15 0.7448953 0.03411553
## 16 16 0.7472485 0.03776240
## 17 17 0.7463233 0.03897123
## 18 18 0.7358729 0.03029684
## 19 19 0.7349270 0.03271942
## 20 20 0.7339903 0.03034002
## 21 21 0.7378558 0.03011041
## 22 22 0.7313638 0.02901735
## 23 23 0.7288484 0.02928343
## 24 24 0.7254300 0.02698280
## 25 25 0.7229597 0.02266863
## 26 26 0.7206346 0.02547939
## 27 27 0.7159822 0.02470323
## 28 28 0.7181532 0.02490136
## 29 29 0.7111819 0.02288379
## 30 30 0.7057061 0.02556775
## 31 31 0.7003142 0.02692491
## 32 32 0.6948144 0.02673874
## 33 33 0.6853677 0.02889688
## 34 34 0.6842083 0.03142871
## 35 35 0.6708637 0.03193291
## 36 36 0.6610183 0.04050358
## 37 37 0.6453392 0.04781050
## 38 38 0.6334470 0.05218160
## 39 39 0.5949594 0.06873558
## 40 40 0.5356862 0.10487264
write.csv(market_holdout_df, "market_holdout_df2.csv", row.names = FALSE)
# To better show the downward trend as you hold out more markets
# Fit linear model
lm_fit <- lm(Avg_R2 ~ Markets_Heldout, data = market_holdout_df)
# Fit segmented (piecewise) model with breakpoint estimation
seg_fit <- segmented(lm_fit, seg.Z = ~Markets_Heldout)
# Add predicted values to data
market_holdout_df$seg_fit <- predict(seg_fit)
# Plot
ggplot(market_holdout_df, aes(x = Markets_Heldout, y = Avg_R2)) +
geom_ribbon(aes(ymin = Avg_R2 - SD_R2, ymax = Avg_R2 + SD_R2), fill = "grey", alpha = 0.4) +
geom_line(aes(y = seg_fit), color = "black", size = 1) +
geom_point(color = "black") +
labs(title = "",
x = "Number of Markets Held Out",
y = "Mean R²") +
theme_minimal() +
theme(
axis.line = element_line(color = "black", size = 0.6),
axis.line.x = element_line(color = "black", size = 0.6),
axis.line.y = element_line(color = "black", size = 0.6),
panel.border = element_blank()
)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
dta_reg <- mypts %>%
filter(Crop == "Maize", Month == 8, Year == 2024) %>%
group_by(Region_GADM) %>%
summarise(Reg_max_july_pkg = max(pkg, na.rm = TRUE))
head(dta_reg)
## # A tibble: 6 × 2
## Region_GADM Reg_max_july_pkg
## <chr> <dbl>
## 1 Arusha 639.
## 2 Dar es Salaam 1000
## 3 Dodoma 532.
## 4 Geita 650
## 5 Iringa 471.
## 6 Kagera 662.
We use Dar es Salaam price as the national price bench mark
# Assign national Average (Dar es Salaam price) value to Raster
# National Average value for maize in August is 1000
# create reference raster
tza_extent <- ext(tza1) |> floor()
rnat <- crop(rast(res=1/12), tza_extent)
rnat <- project(rnat, crs(tza1))
values(rnat) <- 1000
rnatavg <- terra::mask(rnat, tza1)
plot(rnatavg)
names(rnatavg) <- "national_avg"
# Njombes price is missing, use Iringa's price
# Extract Iringa's price
njombe_price <- dta_reg %>%
filter(Region_GADM == "Iringa") %>%
mutate(Region_GADM = "Njombe")
njombe_price
## # A tibble: 1 × 2
## Region_GADM Reg_max_july_pkg
## <chr> <dbl>
## 1 Njombe 471.
# Add to regional summary
dta_reg <- bind_rows(dta_reg, njombe_price)
filter(dta_reg, Region_GADM == "Njombe")
## # A tibble: 1 × 2
## Region_GADM Reg_max_july_pkg
## <chr> <dbl>
## 1 Njombe 471.
#spatial join with the tza1
tza1_with_reg_prices <- merge(tza1, dta_reg, by.x = "NAME_1", by.y = "Region_GADM", all.x = TRUE)
head(tza1_with_reg_prices)
## NAME_1 GID_1 GID_0 COUNTRY VARNAME_1 NL_NAME_1 TYPE_1
## 1 Arusha TZA.1_1 TZA Tanzania <NA> <NA> Mkoa
## 2 Dar es Salaam TZA.2_1 TZA Tanzania Dar-es-salaam|Pwani <NA> Mkoa
## 3 Dodoma TZA.3_1 TZA Tanzania <NA> <NA> Mkoa
## 4 Geita TZA.4_1 TZA Tanzania <NA> <NA> Mkoa
## 5 Iringa TZA.5_1 TZA Tanzania <NA> <NA> Mkoa
## 6 Kagera TZA.6_1 TZA Tanzania <NA> <NA> Mkoa
## ENGTYPE_1 CC_1 HASC_1 ISO_1 Reg_max_july_pkg
## 1 Region 02 TZ.AS TZ-01 639.0909
## 2 Region 07 TZ.DS TZ-02 1000.0000
## 3 Region 01 TZ.DO TZ-03 531.8182
## 4 Region 25 TZ.GE TZ-27 650.0000
## 5 Region 11 TZ.IG TZ-04 471.3636
## 6 Region 18 TZ.KG TZ-05 661.8182
unique(tza1_with_reg_prices$NAME_1)
## [1] "Arusha" "Dar es Salaam" "Dodoma" "Geita"
## [5] "Iringa" "Kagera" "Kaskazini Pemba" "Kaskazini Unguja"
## [9] "Katavi" "Kigoma" "Kilimanjaro" "Kusini Pemba"
## [13] "Kusini Unguja" "Lindi" "Manyara" "Mara"
## [17] "Mbeya" "Mjini Magharibi" "Morogoro" "Mtwara"
## [21] "Mwanza" "Njombe" "Pwani" "Rukwa"
## [25] "Ruvuma" "Shinyanga" "Simiyu" "Singida"
## [29] "Songwe" "Tabora" "Tanga"
#plot(tza1_with_reg_prices)
#text(tza1_with_reg_prices, label="NAME_1")
# convert the shapefile with the price data to a raster
regional_raster <- rasterize(tza1_with_reg_prices, rast(res=1/12, extent=ext(tza1), crs=crs(tza1)), field = "Reg_max_july_pkg")
regional_raster <- resample(regional_raster, r)
plot(regional_raster)
# Bring in predicted june prices for maize 2024
pred_maize <- rast("H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_08.tif")
pred_maize
## class : SpatRaster
## dimensions : 144, 144, 1 (nrow, ncol, nlyr)
## resolution : 0.08333333, 0.08333333 (x, y)
## extent : 29, 41, -12, 0 (xmin, xmax, ymin, ymax)
## coord. ref. : lon/lat WGS 84 (EPSG:4326)
## source : maize_price_rf_pred_08.tif
## name : lyr1
## min value : 770.4785
## max value : 1224.6805
pred_maize <- project(pred_maize, crs(tza1))
names(pred_maize) <- "pred_maize"
# 3 plots in one row
par(mfrow = c(1, 3), mar = c(4, 4, 4, 1))
color_palette <- brewer.pal(9, "YlGnBu")
# Dar es Salaam Price Map
plot(rnatavg,
col = color_palette,
main = "Dar es Salaam Price",
axes = TRUE, box = TRUE)
plot(tza1, add = TRUE, border = "gray30", lwd = 0.5)
# -------------------------------------------
# Regional Prices
plot(regional_raster,
col = color_palette,
main = "Regional Prices",
axes = TRUE, box = TRUE)
plot(tza1, add = TRUE, border = "gray30", lwd = 0.5)
# -------------------------------------------
# Predicted Prices Map
plot(pred_maize,
col = color_palette,
main = "Predicted Prices",
axes = TRUE, box = TRUE)
plot(tza1, add = TRUE, border = "gray30", lwd = 0.5)
# Pairwise comparison
# Extract values from rasters
pred_values <- values(pred_maize)
nat_values <- values(rnatavg)
reg_values <- values(regional_raster)
# Identify rows with all finite values
valid_idx <- which(is.finite(pred_values) & is.finite(nat_values) & is.finite(reg_values))
# Filter vectors
pred_values_f <- pred_values[valid_idx]
nat_values_f <- nat_values[valid_idx]
reg_values_f <- reg_values[valid_idx]
#dist_values_f <- dist_values[valid_idx]
# Create a data frame from the filtered values
comparison_df <- data.frame(
Predicted = pred_values_f,
National_Avg = nat_values_f,
Regional_Avg = reg_values_f
)
# Define pairwise differences
comparison_df <- comparison_df %>%
mutate(
Pred_vs_Nat = National_Avg - Predicted,
Pred_vs_Reg = Regional_Avg - Predicted,
#pred_vs_dist = Predicted - District_Avg
)
head(comparison_df)
## Predicted National_Avg Regional_Avg Pred_vs_Nat Pred_vs_Reg
## 1 968.2156 1000 661.8182 31.784424 -306.3974
## 2 967.7175 1000 661.8182 32.282532 -305.8993
## 3 974.8852 1000 661.8182 25.114807 -313.0670
## 4 986.2094 1000 661.8182 13.790588 -324.3912
## 5 985.2719 1000 661.8182 14.728088 -323.4537
## 6 991.7216 1000 661.8182 8.278442 -329.9034
# % of predicted prices compared to national and regional
mean(comparison_df$Pred_vs_Nat > 0)
## [1] 0.7207987
mean(comparison_df$Pred_vs_Reg > 0)
## [1] 0.03451062
# Difference: National Average - Predicted
diff_nat <- rnatavg - pred_maize
# Difference: Regional Average - Predicted
diff_reg <- regional_raster - pred_maize
# Plot National vs Predicted difference
color_palette <- colorRampPalette(c("darkred", "wheat", "darkgreen"))(100)
plot(diff_nat, main = "National Avg - Predicted", col = color_palette)
plot(diff_reg, main = "Regional Avg - Predicted", col = color_palette)
The OSM dataset used was obtained from Tanzania Populated Places (OpenStreetMap Export) - https://data.humdata.org/dataset/hotosm_tza_populated_places?
# Extract Predicted price using OSM Data
# Bring in Tanzania Populated Places from OSM
TZ_populated_places <- vect("H:\\Tanzania Price data\\Datasets\\OSM_Population\\hotosm_tza_populated_places_points_shp.shp")
# head(TZ_populated_places)
# Plot All data points
plot(tza1)
plot(TZ_populated_places, col="Red", add=TRUE)
sapply(TZ_populated_places, class)
## name name_en place population is_in source
## "character" "character" "character" "character" "character" "character"
## name_sw osm_id osm_type
## "character" "integer" "character"
# Ensure that the population column is numeric
TZ_populated_places$population <- as.numeric(TZ_populated_places$population)
# Filter to remain with rows where population is 20,000 or more
TZ_populated_places_filtered <- TZ_populated_places_filtered <- TZ_populated_places[!is.na(TZ_populated_places$population) & TZ_populated_places$population >= 20000, ]
# Plot places where population data is provided
plot(tza1)
plot(TZ_populated_places_filtered, col="Red", add=TRUE)
# Read the predicted price raster data for Maize
output_dir_Maize <- "H:/Tanzania Price data/Datasets/Pred-plots/Maize"
Maize_price_list <- list.files(output_dir_Maize, pattern = ".tif$", full.names = TRUE)
Maize_price_list
## [1] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_01.tif"
## [2] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_02.tif"
## [3] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_03.tif"
## [4] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_04.tif"
## [5] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_05.tif"
## [6] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_06.tif"
## [7] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_07.tif"
## [8] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_08.tif"
## [9] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_09.tif"
## [10] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_10.tif"
## [11] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_11.tif"
## [12] "H:/Tanzania Price data/Datasets/Pred-plots/Maize/maize_price_rf_pred_12.tif"
Maize_price_rast <- rast(Maize_price_list)
# Extract predicted maize prices for OSM points
osm_points_with_prices <- TZ_populated_places_filtered
# Loop over each month and extract the maize price for OSM points
for (i in 1:nlyr(Maize_price_rast)) {
# Extract prices for month i
prices_for_month <- terra::extract(Maize_price_rast[[i]], TZ_populated_places_filtered, method = "bilinear")
# Add the extracted prices to the SpatVector (each month as a separate column)
osm_points_with_prices[[paste0("maize_price_month_", sprintf("%02d", i))]] <- prices_for_month[,2]
}
# Filter to remain only with necessary columns
osm_points_with_prices_select <- osm_points_with_prices[, c("name",
"place", "is_in", "population", "osm_id", "maize_price_month_01", "maize_price_month_02", "maize_price_month_03", "maize_price_month_04",
"maize_price_month_05", "maize_price_month_06", "maize_price_month_07",
"maize_price_month_08")]
osm_points_with_prices_select <- as.data.frame(osm_points_with_prices_select)
# Reshape to long format with a Month and price columns
osm_points_long <- osm_points_with_prices_select %>%
pivot_longer(cols = starts_with("maize_price_month_"),
names_to = "Month",
values_to = "price")
# Clean month column to extract only the month number
osm_points_long$Month <- as.numeric(gsub("maize_price_month_", "", osm_points_long$Month))
# Add a new column crop with Maize
osm_points_long$crop <- "maize"
osm_points_long <- as.data.table(osm_points_long)
ppts <- head(osm_points_long, 50)
knitr::kable(ppts, caption = "Extracted Maize Prices For Different Markets in Tanzania For the Year 2024")
name | place | is_in | population | osm_id | Month | price | crop |
---|---|---|---|---|---|---|---|
Geita | city | NA | 318006 | 258307668 | 1 | 1147.6372 | maize |
Geita | city | NA | 318006 | 258307668 | 2 | 1124.2366 | maize |
Geita | city | NA | 318006 | 258307668 | 3 | 1128.6222 | maize |
Geita | city | NA | 318006 | 258307668 | 4 | 1076.5419 | maize |
Geita | city | NA | 318006 | 258307668 | 5 | 1076.3618 | maize |
Geita | city | NA | 318006 | 258307668 | 6 | 1062.8699 | maize |
Geita | city | NA | 318006 | 258307668 | 7 | 1039.7928 | maize |
Geita | city | NA | 318006 | 258307668 | 8 | 1031.7062 | maize |
Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 1 | 968.8280 | maize |
Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 2 | 933.6967 | maize |
Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 3 | 925.6108 | maize |
Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 4 | 845.6731 | maize |
Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 5 | 856.6167 | maize |
Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 6 | 829.1568 | maize |
Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 7 | 791.8342 | maize |
Mafinga | town | Iringa, Tanzania | 99305 | 258505599 | 8 | 808.7090 | maize |
Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 1 | NaN | maize |
Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 2 | NaN | maize |
Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 3 | NaN | maize |
Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 4 | NaN | maize |
Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 5 | NaN | maize |
Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 6 | NaN | maize |
Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 7 | NaN | maize |
Masumbwe | town | Shinyanga, Tanzania | 50000 | 259713894 | 8 | NaN | maize |
Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 1 | 973.3528 | maize |
Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 2 | 1007.1622 | maize |
Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 3 | 988.7527 | maize |
Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 4 | 946.6226 | maize |
Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 5 | 900.4689 | maize |
Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 6 | 878.1998 | maize |
Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 7 | 788.1307 | maize |
Vwawa | town | Mbeya, Tanzania | 85000 | 262107786 | 8 | 795.7721 | maize |
Chake-Chake | town | NA | 52047 | 622514835 | 1 | 1185.8481 | maize |
Chake-Chake | town | NA | 52047 | 622514835 | 2 | 1167.2443 | maize |
Chake-Chake | town | NA | 52047 | 622514835 | 3 | 1180.6847 | maize |
Chake-Chake | town | NA | 52047 | 622514835 | 4 | 1122.8524 | maize |
Chake-Chake | town | NA | 52047 | 622514835 | 5 | 1139.6604 | maize |
Chake-Chake | town | NA | 52047 | 622514835 | 6 | 1139.7986 | maize |
Chake-Chake | town | NA | 52047 | 622514835 | 7 | 1131.8901 | maize |
Chake-Chake | town | NA | 52047 | 622514835 | 8 | 1040.9110 | maize |
Babati | town | NA | 67445 | -1047512940 | 1 | 1112.4290 | maize |
Babati | town | NA | 67445 | -1047512940 | 2 | 1097.9500 | maize |
Babati | town | NA | 67445 | -1047512940 | 3 | 1118.4940 | maize |
Babati | town | NA | 67445 | -1047512940 | 4 | 1076.5506 | maize |
Babati | town | NA | 67445 | -1047512940 | 5 | 1053.6470 | maize |
Babati | town | NA | 67445 | -1047512940 | 6 | 1030.2886 | maize |
Babati | town | NA | 67445 | -1047512940 | 7 | 1007.8807 | maize |
Babati | town | NA | 67445 | -1047512940 | 8 | 934.3415 | maize |
Vikindu | town | NA | 70000 | 1826809646 | 1 | 1170.4121 | maize |
Vikindu | town | NA | 70000 | 1826809646 | 2 | 1143.0549 | maize |
# Slope from geodata
# Extract slope from srtm using terrain function in terra package
# tza_alt <- elevation_30s("Tanzania", path=".")
# Obtain the slope in radians
# slope_radian <- terrain(tza_alt, "slope", unit="radians", neighbors=8)
# terra::plot(slope_radian, main = "Slope of Tanzania (Radians)")
# Save slope raster
# output_dir <- "H:/Tanzania Price data/Datasets/Cost_surface data"
# output_file <- paste0(output_dir, "/tz_slope_radian.tif")
#writeRaster(slope_radian, output_file, overwrite = TRUE)
# Set Lambert Azimuthal Equal-Area (LAEA) projection centered on Tanzania
laea_tz <- "+proj=laea +lat_0=-6 +lon_0=35 +datum=WGS84 +units=m +no_defs"
# Load and reproject slope layer
slope_radian <- terra::rast("H:/Tanzania Price data/Datasets/Cost_surface data/tz_slope_radian.tif")
# Reproject to LAEA and set resolution to 500m
slope_radian_laea <- terra::project(slope_radian, laea_tz, res = 500,
filename = "slope_laea_500m.tif", overwrite = TRUE)
terra::plot(slope_radian_laea, main = "Slope of Tanzania (Radians)")
# Use the slope layer obtained above to create a decay coefficient
# We use a decay coefficient of 1.5
# Create slope cost surface
decay <- 1.5
slope_cost <- exp(decay * tan(slope_radian_laea))
names(slope_cost) <- "slope_cost"
terra::plot(slope_cost, main = "Slope cost")
# Roads Data from OSM
# Obtain roads data from osmdata. Code is in reagro.
# roads <- geodata::osm("Tanzania", "highways", ".")
# writeVector(roads, "H:/Tanzania Price data/Datasets/Cost_surface data/tz_roads.shp", overwrite = TRUE)
# Load and reproject roads
roads <- terra::vect("H:/Tanzania Price data/Datasets/Cost_surface data/tz_roads.shp")
# Reproject the roads vector to LAEA
roads_laea <- terra::project(roads, laea_tz)
terra::plot(slope_radian_laea)
terra::lines(roads_laea, col="black")
terra::lines(roads_laea[roads_laea$highway == "primary", ], lwd=4, col="red")
terra::lines(roads_laea[roads_laea$highway == "secondary", ], lwd=2, col="blue")
# Create road cost surface
# Create road cost surface
cfile <- "rdcost_laea.tif"
roadtypes <- c("primary", "secondary", "tertiary")
if (!file.exists(cfile)) {
i <- match(roads_laea$highway, roadtypes)
roads_laea$speed <- c(0.001, 0.0015, 0.002)[i]
rd_cost <- rasterize(roads_laea, slope_radian, field=roads_laea$speed, filename=cfile, overwrite=TRUE)
} else {
rd_cost <- terra::rast(cfile)
}
rd_cost_laea <- terra::project(rd_cost, laea_tz, res = 500,
filename = "rd_cost_laea_500m.tif", overwrite = TRUE)
a <- aggregate(rd_cost_laea, 3, min, na.rm=TRUE)
terra::plot(a, col=c("black", "blue", "red"), main = "Road travel cost (min/m)")
# Load land cover layer
tza_lulc <- terra::rast("H:/Tanzania Price data/Datasets/Cost_surface data/TZ_Land_cover_2021.tif")
plot(tza_lulc, main = "WorldCover 10 m 2021 v200 - landcover classes")
terra::lines(tza1)
Table_class <- data.frame(
Value = c(10,20,30,40,50,60,70,80,90, 95),
LandClass = c("Tree cover",
"Shrubland",
"Grassland",
"Cropland",
"Built-up",
"Bare/sparse vegetation",
"Snow and Ice",
"Permanent water bodies",
"Herbaceous wetland",
"Mangroves"),
travel_speeds=c(0.04, 0.02, 0.02, 0.01, 0.01, 0.02, 0.04, 0.11, 0.04, 0.05)
)
knitr::kable(Table_class, caption = "Land Cover classes (ESA WorldCover 10 m 2021 v200)")
Value | LandClass | travel_speeds |
---|---|---|
10 | Tree cover | 0.04 |
20 | Shrubland | 0.02 |
30 | Grassland | 0.02 |
40 | Cropland | 0.01 |
50 | Built-up | 0.01 |
60 | Bare/sparse vegetation | 0.02 |
70 | Snow and Ice | 0.04 |
80 | Permanent water bodies | 0.11 |
90 | Herbaceous wetland | 0.04 |
95 | Mangroves | 0.05 |
rc <- data.frame(from=unique(tza_lulc)[,1], to=0.02)
rc <- data.frame(from=unique(tza_lulc)[,1], to=0.02)
# Adjust Reclassification Table Based on the Classes
#rc <- data.frame(from = Table_class$Value, to = Table_class$travel_speeds)
# Assign specific travel speeds based on land cover class values
rc$to[rc$from %in% c(10)] <- 0.04 # Tree cover
rc$to[rc$from %in% c(20, 30)] <- 0.02 # Shrubland and Grassland
rc$to[rc$from == 40] <- 0.01 # Cropland
rc$to[rc$from == 50] <- 0.01 # Built-up
rc$to[rc$from == 60] <- 0.02 # Bare/sparse vegetation
rc$to[rc$from == 70] <- 0.04 # Snow and Ice
rc$to[rc$from == 80] <- 0.11 # Permanent water bodies
rc$to[rc$from == 90] <- 0.04 # Herbaceous wetland
rc$to[rc$from == 95] <- 0.05 # Mangroves
rc
## from to
## 1 10 0.04
## 2 20 0.02
## 3 30 0.02
## 4 40 0.01
## 5 50 0.01
## 6 60 0.02
## 7 70 0.04
## 8 80 0.11
## 9 90 0.04
## 10 95 0.05
#reclassifying
tza_lc_reclass <- classify(tza_lulc, rc)
##
|---------|---------|---------|---------|
=========================================
lcfname <- "lc_cost.tif"
if (!file.exists(lcfname)) {
# first aggregate to about the same spatial resolution
lc_cost <- aggregate(tza_lc_reclass, 3, mean)
# then resample
lc_cost <- resample(lc_cost, slope_radian, filename=lcfname, wopt=list(names="lc_cost"), overwrite=TRUE)
} else {
lc_cost <- rast(lcfname)
}
lc_cost_laea <- terra::project(lc_cost, laea_tz, res = 500,
filename = "lc_cost_laea_500m.tif", overwrite = TRUE)
terra::plot(lc_cost_laea, main = "Off-road travel costs (min/m) based on land cover class")
# Resample to harmonize the resolution
#lc_cost_resampled <- terra::resample(lc_cost, slope_cost)
#names(lc_cost_resampled) <- "lc_cost"
#rd_cost_resample <- terra::resample(rd_cost, slope_cost)
#names(rd_cost_resample) <- "rd_cost"
# Combine the cost layers
all_cost <- c(rd_cost_laea, lc_cost_laea)
#getting the minimum value of each grid cell
cost <- min(all_cost, na.rm=TRUE)
cost <- cost * slope_cost
terra::plot(cost, main="Final cost layer (min/m)")
# Combine the cost layers
library(gdistance)
cost <- raster(cost)
conductance <- 1/cost
#Creating a transition object
tran <- transition(conductance, transitionFunction=mean, directions= 8)
tran <- geoCorrection(tran, type="c")
save(trans, file = "H:/Tanzania Price data/Datasets/Cost_surface data/transition.rda")
# Towns of 20,000 people or more from OSM
town <- terra::vect("H:/Tanzania Price data/Datasets/Cost_surface data/tz_towns.shp")
# Project the towns to match the CRS of slope_radian
town_prj <- project(town, crs(slope_cost))
# geom(town_prj)
# Extracting coordinates and population from the projected towns
towns <- data.frame(
x = geom(town_prj)[, 3],
y = geom(town_prj)[, 4],
population = town_prj$population # Population from the projected towns
)
# towns
#convert to spatial points needed in gdistance
spTowns <- SpatialPoints(cbind(towns$x, towns$y))
spTowns
## class : SpatialPoints
## features : 94
## extent : -584189.9, 567514.8, -558136.2, 532149.3 (xmin, xmax, ymin, ymax)
## crs : NA
#Estimating
Ac <- accCost(tran, fromCoords=spTowns)
A <- rast(Ac) / 60
AA <- clamp(A, 0, 24) |> mask(slope_radian_laea)
## Warning: [mask] CRS do not match
terra::plot(AA, main="Access to markets (towns > 20k) in Tanzania (hrs)")
terra::lines(roads_laea)
terra::points(town_prj, col="red", pch=20, cex=1.0)
#Load trans object that was created
load("H:/Tanzania Price data/Datasets/Cost_surface data/transition.rda")
terra::plot(raster(tran), main='Transition matrix (minutes)')
terra::plot(town_prj, col = "red", cex = 0.5, add=TRUE)
# We'll use towns our >20k from OSM
town1 <- terra::vect("H:/Tanzania Price data/Datasets/Cost_surface data/tz_towns.shp")
# Extract predicted maize prices for OSM points
# Lets use January Maize price for now
maize_price_jan <- rast("H:\\Tanzania Price data\\Datasets\\Pred-plots\\Maize\\maize_price_rf_pred_01.tif")
maize_price_jan_laea <- terra::project(maize_price_jan, laea_tz, res = 500)
maize_price_jan_osm <- terra::extract(maize_price_jan_laea, town_prj, fun=mean, buffer=2500, small=TRUE)
# the extract function is returning some NA values, complete the list with a mean of the other values
price_values <- maize_price_jan_osm[, 2]
# Replace NAs with the mean of the non-NA values
maize_price_jan_osm[, 2][is.na(price_values)] <- mean(price_values, na.rm = TRUE)
head(maize_price_jan_osm)
## ID lyr1
## 1 1 1147.7495
## 2 2 967.4465
## 3 3 1115.4174
## 4 4 973.3328
## 5 5 1115.4174
## 6 6 1115.4174
library(sp)
town_prj[["maipkg"]] <- maize_price_jan_osm$lyr1
town_sp <- as(town_prj, "Spatial")
#transportation cost. I converted 0.02 usd/kg/h to TZS/kg/hr = 54.43
tr_cost <- 54.43 #TZS/kg/hr
# Folder to store temporary rasters
temp_dir <- "H:/Tanzania Price data/Datasets/Cost_surface data/Temp_fgate_rasters/"
if (!dir.exists(temp_dir)) {
dir.create(temp_dir, recursive = TRUE)
}
# Loop to calculate and save each farmgate price raster
for (x in seq_along(town_sp)) {
tmp.acc <- accCost(tran, town_sp[x,]) / 60
market_price <- as.data.frame(town_sp)[x, "maipkg"]
# Calculate farmgate price
fgate_price <- market_price - (54.43 * tmp.acc)
fgate_price[fgate_price < 0] <- 0
# Save each raster to disk with a unique filename
save_path <- paste0(temp_dir, "fgate_price_", x, ".tif")
terra::writeRaster(fgate_price, save_path, overwrite = TRUE)
}
# Read all saved rasters back into a list
raster_files <- list.files(temp_dir, pattern = "\\.tif$", full.names = TRUE)
fgate_rasters <- lapply(raster_files, terra::rast)
# Combine all rasters into a single SpatRaster stack
mystack_spat <- do.call(c, fgate_rasters)
fgate_price <- max(mystack_spat) %>% resample(.,maize_price_jan_laea) %>% mask(.,maize_price_jan_laea)
##
|---------|---------|---------|---------|
=========================================
maize_price_jan_laea; fgate_price
## class : SpatRaster
## dimensions : 2669, 2669, 1 (nrow, ncol, nlyr)
## resolution : 500, 500 (x, y)
## extent : -668509.2, 665990.8, -671388, 663112 (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=laea +lat_0=-6 +lon_0=35 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs
## source(s) : memory
## name : lyr1
## min value : 924.9017
## max value : 1454.4166
## class : SpatRaster
## dimensions : 2669, 2669, 1 (nrow, ncol, nlyr)
## resolution : 500, 500 (x, y)
## extent : -668509.2, 665990.8, -671388, 663112 (xmin, xmax, ymin, ymax)
## coord. ref. : +proj=laea +lat_0=-6 +lon_0=35 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs
## source(s) : memory
## name : max
## min value : 0.000
## max value : 1224.889
terra::compareGeom(maize_price_jan_laea, fgate_price)
## [1] TRUE
names(fgate_price) <- "Farm_Gate_Price"
plot(fgate_price, main="Farm gate prices Jan 2024")
names(maize_price_jan_laea) <- "Pred_Maize_price_jan"
maize_values_jan <- terra::values(maize_price_jan_laea, na.rm = TRUE)
# Get minimum and maximum values
min_maize_jan <- min(maize_values_jan, na.rm = TRUE)
max_maize_jan <- max(maize_values_jan, na.rm = TRUE)
# Define the break interval for plotting
break_interval <- 100
breaks_seq <- seq(min_maize_jan, max_maize_jan, by = break_interval)
color_palette <- rev(terrain.colors(100))
# Plot
terra::plot(
maize_price_jan_laea,
main = "Predicted Maize Price Jan 2024",
zlim = c(min_maize_jan, max_maize_jan),
col = color_palette,
breaks = breaks_seq
)
# Set up the layout for 2 plots side by side
par(mfrow = c(1, 2), mar = c(4, 4, 2, 1))
# Plot Maize Price
plot(
maize_price_jan_laea,
main = "Predicted Maize Price Jan 2024",
zlim = c(min_maize_jan, max_maize_jan),
col = color_palette,
breaks = breaks_seq
)
# Plot Farm Gate Price
plot(fgate_price, main="Farm gate prices Jan 2024")