knitr::opts_chunk$set(warning=F, message=F, results=F, dev='svg')library(tidyverse) #loads multiple packages (see https://tidyverse.tidyverse.org/)#core tidyverse packages loaded:# ggplot2, for data visualisation. https://ggplot2.tidyverse.org/# dplyr, for data manipulation. https://dplyr.tidyverse.org/# tidyr, for data tidying. https://tidyr.tidyverse.org/# readr, for data import. https://readr.tidyverse.org/# purrr, for functional programming. https://purrr.tidyverse.org/# tibble, for tibbles, a modern re-imagining of data frames. https://tibble.tidyverse.org/# stringr, for strings. https://stringr.tidyverse.org/# forcats, for factors. https://forcats.tidyverse.org/# lubridate, for date/times. https://lubridate.tidyverse.org/#also loads the following packages (less frequently used):# Working with specific types of vectors:# hms, for times. https://hms.tidyverse.org/# Importing other types of data:# feather, for sharing with Python and other languages. https://github.com/wesm/feather# haven, for SPSS, SAS and Stata files. https://haven.tidyverse.org/# httr, for web apis. https://httr.r-lib.org/# jsonlite for JSON. https://arxiv.org/abs/1403.2805# readxl, for .xls and .xlsx files. https://readxl.tidyverse.org/# rvest, for web scraping. https://rvest.tidyverse.org/# xml2, for XML. https://xml2.r-lib.org/# Modelling# modelr, for modelling within a pipeline. https://modelr.tidyverse.org/# broom, for turning models into tidy data. https://broom.tidymodels.org/# Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors#loading additional relevant packageslibrary(cowplot) #for plot_grid()#setting graphs themetheme_set(theme_minimal() +theme(plot.title =element_text(face="bold") ) )#setting viridis theme for colorsscale_colour_continuous <- scale_colour_viridis_cscale_colour_discrete <- scale_colour_viridis_dscale_colour_binned <- scale_colour_viridis_b#setting viridis theme for fillscale_fill_continuous <- scale_fill_viridis_cscale_fill_discrete <- scale_fill_viridis_dscale_fill_binned <- scale_fill_viridis_bSource <-"Agence de l'Eau Rhône-Méditerranée Corse\nComputation by Thomas Starck"Year_analysis <-2020# Load the function filesource("functions.R")
the 2017 file related to WWTP description is not present in the data (apparently replaced by the performance file by error).
We load the files related to WWTP performances.
We consider incoming NGL and NTK are the same (only NTK reported before 2009 and only NGL after 2009). This approximation does not hold for outgoing NTK and NGL.
Code
#now need to read separately files 1999-2007 vs 2009-2020 (no file for 2008), because they are not reported in the same columns format#path for data path_source <-"source_data/06_rhone_mediterranee/performance_STEU/"#reading files for years 1999-2007file_1999_2007 <-list.files( #read and merge csv of all yearspath = path_source,pattern =c("1999|2000|2001|2002|2003|2004|2005|2006|2007"), full.names = T, recursive = T) %>%lapply(read_csv2, locale=locale(encoding="latin1")) %>%bind_rows()file_1999_2007 <- file_1999_2007 %>%distinct() #read and merge csv of all yearsfile_2008_2020 <-list.files(path = path_source,pattern =c("2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019|2020"), full.names = T, recursive = T) %>%lapply(read_csv2, locale=locale(encoding="latin1")) %>%bind_rows()file_2008_2020 <- file_2008_2020 %>%distinct()# ON PEUT ENCORE SELECTIONNERNOM ET CODE COMMUNE, DEP, REG ET ZONE HYDROGRAPHIQUEN_P_rhone_mediterranee_1999_2007 <- file_1999_2007 %>%select(Year = Année, code_WWTP = code_STEP, name_WWTP = Nom_STEP, name_commune = Nom_commune,treatment_type = Dispositif_traitement,capacity = Capacité_traitement, PE_mean_in = Quantité_pollution_entrante,PE_mean_out = Quantité_pollution_sortante, MES_in = Quantité_MES_entrante,MES_out = Quantité_MES_sortante,Pt_in = Quantité_P_entrante,Pt_out = Quantité_P_sortante,NTK_in = Quantité_NR_entrante, NTK_out = Quantité_NR_sortante ) %>%mutate(NGL_in = NTK_in,NGL_out =NA,DBO5_in = PE_mean_in*0.06,DBO5_out = PE_mean_out*0.06 )#ON PEUT ENCORE SELECTIONNER NOM COMMUNE, DEP, REGN_P_rhone_mediterranee_2008_2020 <- file_2008_2020 %>%select(Year = ANNEE, code_WWTP =`CODE NATIONAL STATION`, name_WWTP =`NOM STATION`, name_commune =`NOM COMMUNE IMPLANTATION STATION`,capacity =`CAPACITE STATION EN EH`, DBO5_in =`FLUX EN ENTREE DE STATION - DBO5`, DBO5_out =`FLUX EN SORTIE DE STATION - DBO5`, DCO_in =`FLUX EN ENTREE DE STATION - DCO`,DCO_out =`FLUX EN SORTIE DE STATION - DCO`,MES_in =`FLUX EN ENTREE DE STATION - MES`,MES_out =`FLUX EN SORTIE DE STATION - MES`,NGL_in =`FLUX EN ENTREE DE STATION - NGL`,NGL_out =`FLUX EN SORTIE DE STATION - NGL`,Pt_in =`FLUX EN ENTREE DE STATION - PT`,Pt_out =`FLUX EN SORTIE DE STATION - PT` ) %>%mutate(NTK_in = NGL_in,NTK_out =NA,treatment_type =NA, )N_P_rhone_mediterranee <-bind_rows( N_P_rhone_mediterranee_1999_2007, N_P_rhone_mediterranee_2008_2020 )
Code
#finding double reportingdoublons_1999_2007 <-N_P_rhone_mediterranee_1999_2007 %>%select(Year, code_WWTP) %>%count(Year, code_WWTP) %>%filter(n !=1)doublons_1999_2007 <-inner_join( doublons_1999_2007, N_P_rhone_mediterranee_1999_2007,by =c("Year", "code_WWTP"))#no doublons in 2008-2020 ; uncomment to check# doublons_2008_2020 <-N_P_rhone_mediterranee_2008_2020 %>%# select(Year, code_WWTP) %>%# count(Year, code_WWTP) %>%# filter(n !=1)#for now just remove, see later if we can keep values#remove double reported lines from main fileN_P_rhone_mediterranee_1999_2007 <-anti_join( N_P_rhone_mediterranee_1999_2007, doublons_1999_2007 )
loading files of WWTP descriptions
Code
path_source <-"source_data/06_rhone_mediterranee/description_STEU/"# need to load separately the years because different format reporting of the files# + 2017 file is missing (they provided the performance instead)file_WWTP_1999_2007 <-list.files(path = path_source,pattern =c("1999|2000|2001|2002|2003|2004|2005|2006|2007"), full.names = T, recursive = T) %>%lapply(read_csv2, locale=locale(encoding="latin1")) %>%bind_rows()file_WWTP_2008_2013 <-list.files(path = path_source,pattern =c("2008|2009|2010|2011|2012|2013"), full.names = T, recursive = T ) %>%lapply( read_csv2, locale=locale(encoding="latin1")) %>%bind_rows()file_WWTP_2014_2020 <-list.files(path = path_source,pattern =c("2014|2015|2016|2018|2019|2020"), full.names = T, recursive = T ) %>%lapply( read_csv2, locale=locale(encoding="latin1") ) %>%bind_rows()#Eroor for 2017, they supply the performance file and not the description file => no data#ON PEUT ENCORE SELECTIONNER LA PRECISION DU REJET, CODE ET NOM COM, DEP, REG, ZONE HYDROWWTP_1999_2007 <- file_WWTP_1999_2007 %>%select(Year = Année,code_WWTP = Code_step,name_WWTP = Nom_step, capacity = Capacité_traitement,treatment_type = Dispositif_traitement,lat_WWTP = CoordonnéeX_step,long_WWTP = CoordonnéeY_step,lat_discharge = CoordonnéeX_rejet,long_discharge = CoordonnéeY_rejet,INSEE_COM = Code_département ) #ON PEUT ENCORE SELECTIONNER LA PRECISION DU REJET, CODE ET NOM COM, DEP, REG, ZONE HYDROWWTP_2008_2013 <- file_WWTP_2008_2013 %>%select(Year = Année,code_WWTP = code_STEP,name_WWTP = Nom_STEP, capacity = Capacité_traitement,treatment_type = Dispositif_traitement,lat_WWTP = Coordonnées_STEP_X,long_WWTP = Coordonnées_STEP_Y,lat_discharge = Coordonnées_REJET_X,long_discharge = Coordonnées_REJET_X,INSEE_COM = Code_Département_STEP )#ON PEUT ENCORE SELECTIONNER LA PRECISION DU REJET, CODE ET NOM COM, DEP, REG, ZONE HYDROWWTP_2014_2020 <- file_WWTP_2014_2020 %>%select(Year = Année,code_WWTP = code_STEP,name_WWTP = Nom_STEP, capacity = Capacité_EH, #apparently in kgDBO5 ?treatment_type = Dispositif_traitement,lat_WWTP = Coordonnées_STEP_X,long_WWTP = Coordonnées_STEP_Y,lat_discharge = Coordonnées_REJET_X,long_discharge = Coordonnées_REJET_X,INSEE_COM = Code_Département_STEP )WWTP <-bind_rows(WWTP_1999_2007, WWTP_2008_2013, WWTP_2014_2020)
Careful, there are a lot of double reporting starting 2008 ! We clean it.
Code
# Careful, a lot of doubel reporting !!!! check it in this this filedouble_reporting_WWTP <- WWTP %>%select(Year, code_WWTP) %>%count(Year, code_WWTP) %>%filter(n !=1)# we remove the double reportingWWTP <- WWTP %>%distinct()
There remains a few double reporting, with different values. In the WWTP description files, these are :
Roussilon (code SANDRE 060984102001) in 1999: remove the one with capacity 350 (kept: 1100)
BISINCHI (code SANDRE 060920039001) in 2001: remove the one with capacity 500 (kept: 600)
ST ROMAIN DE JALIONAS (code SANDRE 060938451001) in 2001: remove the one with capacity 3200 (kept: 9000)
SARRAS - CHAMPIALET (code SANDRE 060907308002) in 2005: remove the one with capacity 50 (kept: 30)
AREGNO (code SANDRE 060920020001) in 2005 : remove the one with capacity 400 (kept: 9500)
JAILLANS - CHEF LIEU (code SANDRE 060926381001) in 2006: remove the one with capacity 200 (kept: 560)
ST APPOLINAIRE (code SANDRE 060969181001) in 2007 : remove the one with capacity 150 (kept: 170)
Montpellier (Maera) (code SANDRE 060934172001) in 2018, 2019, 2020: remove the one with unreported discharge coordinates
For the WWTP performances files, these are almost the same errors (except for Montpellier which is not double reported here). We correct both files.
We check reporting coherence of the 2 files. Starting 2010, the WWTP description file reports more WWTP (left), but in terms of total reported capacity the 2 are very similar (because the unreported stations are very small)..
#only 1 WWTP not reporting capacity each year in the CAT_descrip_step files => OKtest <- WWTP %>%filter(is.na(capacity)==T) %>%group_by(Year) %>%summarise(n=n())# <50 WWTP not reporting capacity each year in CAT_perfostep files files => OKtest <- N_P_rhone_mediterranee %>%filter(is.na(capacity)==T) %>%group_by(Year) %>%summarise(n=n())rm(test)
We merge the 2 files. For the capacities, we give priority to the one reported in the perf file, and if it is empty, we use the one in the description file.
Code
#first wee keep the capacity, treatment type and name reported in both filesN_P_rhone_mediterranee <-left_join( N_P_rhone_mediterranee %>%rename(treatment_type_perf = treatment_type,capacity_perf = capacity,name_WWTP_perf = name_WWTP ), WWTP %>%rename(treatment_type_descr = treatment_type,capacity_descr = capacity,name_WWTP_descr = name_WWTP ), by=c("Year", "code_WWTP") )#we give priority to the perf file, and if empty we take the values of descr fileN_P_rhone_mediterranee <- N_P_rhone_mediterranee %>%mutate(capacity_perf =case_when(is.na(capacity_perf)==T ~ capacity_descr, T ~ capacity_perf ),treatment_type_perf =case_when(is.na(treatment_type_perf)==T ~ treatment_type_descr, T ~ treatment_type_perf ),name_WWTP_perf =case_when(is.na(name_WWTP_perf)==T ~ name_WWTP_descr, T ~ name_WWTP_perf ) )#We keep only the corrected onesN_P_rhone_mediterranee <- N_P_rhone_mediterranee %>%rename(capacity = capacity_perf,treatment_type = treatment_type_perf,name_WWTP = name_WWTP_perf ) %>%select(-capacity_descr, -treatment_type_descr, -name_WWTP_descr )
If nutrient flows are negative or null we replace them with empty values
Code
# Some reported flows are negative or null. We replace them with empty values.#PtN_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$Pt_in <=0] <-NAN_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$Pt_out <=0] <-NA#DBO5N_P_rhone_mediterranee$DBO5_in[N_P_rhone_mediterranee$DBO5_in <=0] <-NAN_P_rhone_mediterranee$DBO5_out[N_P_rhone_mediterranee$DBO5_out <=0] <-NA#DCON_P_rhone_mediterranee$DCO_in[N_P_rhone_mediterranee$DCO_in <=0] <-NAN_P_rhone_mediterranee$DCO_out[N_P_rhone_mediterranee$DCO_out <=0] <-NA#MESN_P_rhone_mediterranee$MES_in[N_P_rhone_mediterranee$MES_in <=0] <-NAN_P_rhone_mediterranee$MES_out[N_P_rhone_mediterranee$MES_out <=0] <-NA#NTKN_P_rhone_mediterranee$NTK_in[N_P_rhone_mediterranee$NTK_in <=0] <-NAN_P_rhone_mediterranee$NTK_out[N_P_rhone_mediterranee$NTK_out <=0] <-NA#NGLN_P_rhone_mediterranee$NGL_in[N_P_rhone_mediterranee$NGL_in <=0] <-NAN_P_rhone_mediterranee$NGL_out[N_P_rhone_mediterranee$NGL_out <=0] <-NA
Some WWTP have unreported capacities. For the most recent of them we are able to get their capacity from the sanitation portal data.
Code
#get the list of WWTP with unreported capacitiesunreported_capacity <- N_P_rhone_mediterranee %>%filter(is.na(capacity))#get capacities from the sanitation portalsanitation_portal_capacity <-read_csv("output_data/all_WWTP/all_WWTP_sanitation_portal.csv") %>%select(code_WWTP, capacity) %>%distinct()#change value when possibleunreported_capacity <-left_join( unreported_capacity %>%rename(capacity_LB = capacity), sanitation_portal_capacity %>%select(code_WWTP, capacity), by="code_WWTP") %>%select(-capacity_LB)#we change the values in the main file.N_P_rhone_mediterranee <- N_P_rhone_mediterranee %>%filter(is.na(capacity)==F)N_P_rhone_mediterranee <-bind_rows( N_P_rhone_mediterranee, unreported_capacity)
In spite of this correction, there remains some unreported capacities but it is negligible (cf non discernable areas near 0 in the 2 graphs below).
f_graph_nutrient <-function(dataset, nutrient_in, nutrient_out, label, legend_x, legend_y){ p <-ggplot(dataset) +#nutrient inflowgeom_line(aes( Year, !!as.symbol(nutrient_in), color=nutrient_in ) ) +#nutrient outflowgeom_line(aes( Year,!!as.symbol(nutrient_out), color = nutrient_out ) ) +ylim(0, NA) +theme(legend.position =c(legend_x, legend_y), legend.title =element_blank() ) +labs(x="", y=paste("kt of", label) , title =paste("Reported", label, "flows in Rhône-Méditerranée WWTPs") ,subtitle ="reported, not necessarily actual ; here before data cleaning", caption = Source )return(p)}##if want to try interactive plot :# library(plotly)# ggplotly(p, tooltip = c("y", "x"))
We correct 1 large outliers to be able to visualize the flows : incoming NGL and NTK in2018 for STATION D’EPURATION DE NARBONNE - VILLE (code SANDRE 060911262001): error of 4 ODM
Pt in - 2019, STATION D’EPURATION DE BESSEGES, code SANDRE 060930037002 - 2010, STATION D’EPURATION DE ALES, code SANDRE 060930259003 - 2019, STATION D’EPURATION DE ST AMBROIX - MAS CHABERT, code SANDRE 060930227002 - 2006, DIJON (EAUVITALE), code SANDRE 060921231001
Pt_out - 2019, STATION D’EPURATION DE ST AMBROIX - MAS CHABERT, code SANDRE 060930227002 - 2019, STATION D’EPURATION DE BESSEGES, code SANDRE 060930037002 - 2014, STATION D’EPURATION DE SCIENTRIER, code SANDRE 060974220001
Pt in - 2019 and 2020, STATION D’EPURATION DE BESSEGES, code SANDRE 060930037002, 3 OM - 2010 and 2009, STATION D’EPURATION DE ALES, code SANDRE 060930259003, inconsistent = >removed - 2019, STATION D’EPURATION DE ST AMBROIX - MAS CHABERT, code SANDRE 060930227002 , 3 OM - 2006, DIJON (EAUVITALE), code SANDRE 060921231001, 1 OM
Pt_out - 2019, STATION D’EPURATION DE ST AMBROIX - MAS CHABERT, code SANDRE 060930227002, 3 OM - 2019 and 2020, STATION D’EPURATION DE BESSEGES, code SANDRE 060930037002, 3 OM - 2013 and 2014, STATION D’EPURATION DE SCIENTRIER, code SANDRE 060974220001, 1 and 2 OM
temp <- N_P_rhone_mediterranee %>%group_by(Year) %>%summarise(capacity =sum(capacity, na.rm = T)/10^6, #capacity in million PEnb_WWTP =n() )
Code
coef <-max(temp$capacity)/max(temp$nb_WWTP)ggplot(temp) +geom_line(aes( Year, nb_WWTP, color ="number of reported facilities (left)" ) ) +geom_line(aes( Year, capacity/coef, color ="total reported capacity (right)" ) ) +scale_y_continuous(limits =c(0, NA),sec.axis =sec_axis(trans=~.*coef, name="million Population Equivalent" ) ) +labs(title ="Evolution of the reporting in the database",subtitle ="in terms of number of WWTP reported and total reported capacity",y="", x="", color="", caption =Source ) +theme(legend.position =c(0.7, 0.3) )
Code
temp <- N_P_rhone_mediterranee %>%filter(is.na(capacity)==F) %>%select(Year, capacity, PE_bin) %>%group_by(Year, PE_bin) %>%summarise(`capacity (million PE)`=sum(capacity)/10^6,`number of stations`=n() ) %>%gather(key=capacity_or_n, value = value, `capacity (million PE)`, `number of stations`)
ggplot(temp) +geom_area(aes(Year, value, fill=PE_bin)) +facet_wrap(vars(capacity_or_n), scales="free") +labs(title="Reporting in the database",subtitle ="For each capacity category",x="", y="", fill="nominal capacity \n(Population Equivalent)",caption = Source )
Code
ggplot(temp) +geom_area(aes(Year, value, fill=PE_bin), position ="fill") +facet_wrap(vars(capacity_or_n), scales="free") +labs(title="Reporting in the database",subtitle ="Proportion of each capacity category",x="", y="", fill="nominal capacity \n(Population Equivalent)",caption = Source )
Code
temp <- N_P_rhone_mediterranee %>%filter(Year==Year_analysis)ggplot(temp) +geom_histogram(aes( capacity, fill ="Nb of facilities" ), n=100, alpha=.4, stat="density" ) +geom_histogram(aes( capacity, weight = capacity, fill="Nb of facilities weighted by capacity" ), n=100, alpha=.4, stat="density" ) +theme(legend.position =c(0.7,0.8), ) +labs(x="Waste Water Treatment Plant Capacity \n(Population Equivalent)",y="Distribution density",fill="Distribution of",title =paste("WWTP capacities distribution", as.character(Year_analysis)),subtitle ="raw of weighted by capacity" ) +scale_x_log10(labels = scales::label_number(drop0trailing =TRUE) )
#changing the graph function to change the subtitle (before data cleaning => after data cleaning)f_graph_nutrient <-function(dataset, nutrient_in, nutrient_out, label, legend_x, legend_y){ p <-ggplot(dataset) +#nutrient inflowgeom_line(aes( Year, !!as.symbol(nutrient_in), color=nutrient_in ) ) +#nutrient outflowgeom_line(aes( Year,!!as.symbol(nutrient_out), color = nutrient_out ) ) +ylim(0, NA) +theme(legend.position =c(legend_x, legend_y), legend.title =element_blank() ) +labs(x="", y=paste("kt of", label) , title =paste("Reported", label, "flows in Rhône-Méditerranée WWTPs") ,subtitle ="reported, not necessarily actual ; here after data cleaning", caption = Source ) return(p)}
We compute in terms of installed capacity the reported and unreported flows for NGL, Pt, DBO5, DCO and MES. We do this for each year and for each capacity category.
Code
#create file of reported temp <- N_P_rhone_mediterranee %>%select( Year, PE_bin, capacity, Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out ) %>%#spots unreported values for each nutrient flowmutate(across(c(Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out),~is.na(.x)==F ) ) %>%#gather to ba able to then group by flow and count capacitygather(key=nutrient_flow, value = reported_pollution, Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out ) %>%#count reported capacity and unreported capacity for each (Year, capacity category, nutrient flow)group_by( Year, PE_bin, nutrient_flow, reported_pollution ) %>%summarise(capacity =sum(capacity, na.rm=T)/10^6 ) %>%#creates reported/unreported names for each nutrient flow and spreads into columnsmutate(nutrient_flow =case_when( reported_pollution == T ~paste0(nutrient_flow, "_reported"), reported_pollution == F ~paste0(nutrient_flow, "_unreported") ) ) %>%select(-reported_pollution) %>%spread(nutrient_flow, capacity)# NA values replaced by 0 for future coeff computationtemp[is.na(temp)] <-0
From this we compute proportionate coefficient to extrapolate real flows.
We add these adjusted flows to the main files reporting flows at the basin scale
Code
#adding adjusted flows to the basin x capacity filesbasin_PE_N_P_rhone_mediterranee <-left_join( basin_PE_N_P_rhone_mediterranee, temp2, by=c("Year", "PE_bin"))#aggregating adjusted flows at the basin scale without the capacity categoriestemp <- temp2 %>%select(-PE_bin) %>%group_by(Year) %>%summarise_all(~signif(sum(.x), 3))#adding adjusted flows to the basin filesbasin_N_P_rhone_mediterranee <-left_join( basin_N_P_rhone_mediterranee, temp, by="Year")
We plot the comparison reported / adjusted in the following graphs.
We save the data. We consider all the absolute flows to be unreliable before 2009, and so we do not save them. We also remove the ratios nutrient_flow:capacities.
Before 2009, we also all data that include outgoing DBO5 to be unreliable. This includes:
outgoing DBO5:Pt ratio
DBO5 yield
(not need for DBO5:DCO and DBO5:NGL since outgoing NGL and DCO are not reported before 2009)