Rhone Méditerranee Corse

Code

knitr::opts_chunk$set(warning=F, message=F, results=F, dev='svg')

library(tidyverse) #loads multiple packages (see https://tidyverse.tidyverse.org/)

#core tidyverse packages loaded:
# ggplot2, for data visualisation. https://ggplot2.tidyverse.org/
# dplyr, for data manipulation. https://dplyr.tidyverse.org/
# tidyr, for data tidying. https://tidyr.tidyverse.org/
# readr, for data import. https://readr.tidyverse.org/
# purrr, for functional programming. https://purrr.tidyverse.org/
# tibble, for tibbles, a modern re-imagining of data frames. https://tibble.tidyverse.org/
# stringr, for strings. https://stringr.tidyverse.org/
# forcats, for factors. https://forcats.tidyverse.org/
# lubridate, for date/times. https://lubridate.tidyverse.org/

#also loads the following packages (less frequently used):
# Working with specific types of vectors:
#     hms, for times. https://hms.tidyverse.org/
# Importing other types of data:
#     feather, for sharing with Python and other languages. https://github.com/wesm/feather
#     haven, for SPSS, SAS and Stata files. https://haven.tidyverse.org/
#     httr, for web apis. https://httr.r-lib.org/
#     jsonlite for JSON. https://arxiv.org/abs/1403.2805
#     readxl, for .xls and .xlsx files. https://readxl.tidyverse.org/
#     rvest, for web scraping. https://rvest.tidyverse.org/
#     xml2, for XML. https://xml2.r-lib.org/
# Modelling
#     modelr, for modelling within a pipeline. https://modelr.tidyverse.org/
#     broom, for turning models into tidy data. https://broom.tidymodels.org/

# Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#loading additional relevant packages
library(cowplot) #for plot_grid()


#setting graphs theme
theme_set(
  theme_minimal() +
    theme(
      plot.title = element_text(face="bold")
      )
  )

#setting viridis theme for colors
scale_colour_continuous <- scale_colour_viridis_c
scale_colour_discrete   <- scale_colour_viridis_d
scale_colour_binned     <- scale_colour_viridis_b
#setting viridis theme for fill
scale_fill_continuous <- scale_fill_viridis_c
scale_fill_discrete   <- scale_fill_viridis_d
scale_fill_binned     <- scale_fill_viridis_b

Source <- "Agence de l'Eau Rhône-Méditerranée Corse\nComputation by Thomas Starck"

Year_analysis <- 2020

# Load the function file
source("functions.R")

Source and Description

Some information can be found on the Guide de l’eau (water guides), here and here.

The Etat des lieux report ican be found here. There are 15.5 million inhabitants on the basin.

All data were downloaded here. We use 2 kind of data

performances steps (1999-2020), described here. Data about pollution flows in and out of WWTP.
descriptif STEU (1993-2020), described here. Description of WWTP.

the 2017 file related to WWTP description is not present in the data (apparently replaced by the performance file by error).

We load the files related to WWTP performances.

We consider incoming NGL and NTK are the same (only NTK reported before 2009 and only NGL after 2009). This approximation does not hold for outgoing NTK and NGL.

Code

#now need to read separately files 1999-2007 vs 2009-2020 (no file for 2008), because they are not reported in the same columns format

#path for data 
path_source <- "source_data/06_rhone_mediterranee/performance_STEU/"

#reading files for years 1999-2007
file_1999_2007 <- list.files( #read and merge csv of all years
    path = path_source,
    pattern = c("1999|2000|2001|2002|2003|2004|2005|2006|2007"), 
    full.names = T, recursive = T) %>%
    lapply(read_csv2, locale=locale(encoding="latin1")) %>% bind_rows()
file_1999_2007 <- file_1999_2007 %>% distinct() 

#read and merge csv of all years
file_2008_2020 <- list.files(
    path = path_source,
    pattern = c("2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019|2020"), 
    full.names = T, recursive = T) %>%
    lapply(read_csv2, locale=locale(encoding="latin1")) %>% bind_rows()
file_2008_2020 <- file_2008_2020 %>% distinct()


# ON PEUT ENCORE SELECTIONNERNOM ET CODE COMMUNE, DEP, REG ET ZONE HYDROGRAPHIQUE
N_P_rhone_mediterranee_1999_2007 <- file_1999_2007 %>%
  select(
    Year = Année, 
    code_WWTP = code_STEP, 
    name_WWTP = Nom_STEP, 
    name_commune = Nom_commune,
    treatment_type = Dispositif_traitement,
    capacity = Capacité_traitement, 
    PE_mean_in = Quantité_pollution_entrante,
    PE_mean_out = Quantité_pollution_sortante, 
    MES_in = Quantité_MES_entrante,
    MES_out = Quantité_MES_sortante,
    Pt_in = Quantité_P_entrante,
    Pt_out = Quantité_P_sortante,
    NTK_in = Quantité_NR_entrante, 
    NTK_out = Quantité_NR_sortante
  ) %>%
  mutate(
    NGL_in = NTK_in,
    NGL_out = NA,
    DBO5_in = PE_mean_in*0.06,
    DBO5_out = PE_mean_out*0.06
  )

#ON PEUT ENCORE SELECTIONNER NOM COMMUNE, DEP, REG
N_P_rhone_mediterranee_2008_2020 <- file_2008_2020 %>%
  select(
    Year = ANNEE, 
    code_WWTP = `CODE NATIONAL STATION`, 
    name_WWTP = `NOM STATION`, 
    name_commune = `NOM COMMUNE IMPLANTATION STATION`,
    capacity = `CAPACITE STATION EN EH`, 
    DBO5_in = `FLUX EN ENTREE DE STATION - DBO5`, 
    DBO5_out = `FLUX EN SORTIE DE STATION - DBO5`, 
    DCO_in = `FLUX EN ENTREE DE STATION - DCO`,
    DCO_out = `FLUX EN SORTIE DE STATION - DCO`,
    MES_in = `FLUX EN ENTREE DE STATION - MES`,
    MES_out = `FLUX EN SORTIE DE STATION - MES`,
    NGL_in = `FLUX EN ENTREE DE STATION - NGL`,
    NGL_out = `FLUX EN SORTIE DE STATION - NGL`,
    Pt_in = `FLUX EN ENTREE DE STATION - PT`,
    Pt_out = `FLUX EN SORTIE DE STATION - PT` 
  ) %>%
  mutate(
    NTK_in = NGL_in,
    NTK_out = NA,
    treatment_type = NA,
  )

N_P_rhone_mediterranee <- 
  bind_rows(
    N_P_rhone_mediterranee_1999_2007, N_P_rhone_mediterranee_2008_2020
  )

Code

#finding double reporting
doublons_1999_2007 <-N_P_rhone_mediterranee_1999_2007 %>%
  select(Year, code_WWTP) %>%
  count(Year, code_WWTP) %>%
  filter(n !=1)

doublons_1999_2007 <- inner_join(
  doublons_1999_2007,
  N_P_rhone_mediterranee_1999_2007,
  by = c("Year", "code_WWTP")
)

#no doublons in 2008-2020 ; uncomment to check
# doublons_2008_2020 <-N_P_rhone_mediterranee_2008_2020 %>%
#   select(Year, code_WWTP) %>%
#   count(Year, code_WWTP) %>%
#   filter(n !=1)

#for now just remove, see later if we can keep values
#remove double reported lines from main file
N_P_rhone_mediterranee_1999_2007 <- 
  anti_join(
    N_P_rhone_mediterranee_1999_2007,
    doublons_1999_2007
    )

loading files of WWTP descriptions

Code

path_source <- "source_data/06_rhone_mediterranee/description_STEU/"
# need to load separately the years because different format reporting of the files
# + 2017 file is missing (they provided the performance instead)

file_WWTP_1999_2007 <- list.files(
    path = path_source,
    pattern = c("1999|2000|2001|2002|2003|2004|2005|2006|2007"), 
    full.names = T, recursive = T) %>%
    lapply(read_csv2, locale=locale(encoding="latin1")) %>% bind_rows()

file_WWTP_2008_2013 <- 
  list.files(
    path = path_source,
    pattern = c("2008|2009|2010|2011|2012|2013"), 
    full.names = T, recursive = T
    ) %>%
  lapply(
    read_csv2, locale=locale(encoding="latin1")) %>% 
  bind_rows()

file_WWTP_2014_2020 <- 
  list.files(
    path = path_source,
    pattern = c("2014|2015|2016|2018|2019|2020"), 
    full.names = T, recursive = T
    ) %>%
  lapply(
    read_csv2, locale=locale(encoding="latin1")
  ) %>%
  bind_rows()
#Eroor for 2017, they supply the performance file and not the description file => no data

#ON PEUT ENCORE SELECTIONNER LA PRECISION DU REJET, CODE ET NOM COM, DEP, REG, ZONE HYDRO
WWTP_1999_2007 <- file_WWTP_1999_2007 %>%
  select(
    Year = Année,
    code_WWTP = Code_step,
    name_WWTP = Nom_step, 
    capacity = Capacité_traitement,
    treatment_type = Dispositif_traitement,
    lat_WWTP = CoordonnéeX_step,
    long_WWTP = CoordonnéeY_step,
    lat_discharge = CoordonnéeX_rejet,
    long_discharge = CoordonnéeY_rejet,
    INSEE_COM = Code_département
  ) 
#ON PEUT ENCORE SELECTIONNER LA PRECISION DU REJET, CODE ET NOM COM, DEP, REG, ZONE HYDRO
WWTP_2008_2013 <- file_WWTP_2008_2013 %>%
  select(
    Year = Année,
    code_WWTP = code_STEP,
    name_WWTP = Nom_STEP, 
    capacity = Capacité_traitement,
    treatment_type = Dispositif_traitement,
    lat_WWTP = Coordonnées_STEP_X,
    long_WWTP = Coordonnées_STEP_Y,
    lat_discharge = Coordonnées_REJET_X,
    long_discharge = Coordonnées_REJET_X,
    INSEE_COM = Code_Département_STEP
  )

#ON PEUT ENCORE SELECTIONNER LA PRECISION DU REJET, CODE ET NOM COM, DEP, REG, ZONE HYDRO
WWTP_2014_2020 <- file_WWTP_2014_2020 %>%
  select(
    Year = Année,
    code_WWTP = code_STEP,
    name_WWTP = Nom_STEP, 
    capacity = Capacité_EH, #apparently in kgDBO5 ?
    treatment_type = Dispositif_traitement,
    lat_WWTP = Coordonnées_STEP_X,
    long_WWTP = Coordonnées_STEP_Y,
    lat_discharge = Coordonnées_REJET_X,
    long_discharge = Coordonnées_REJET_X,
    INSEE_COM = Code_Département_STEP
  )
WWTP <- 
  bind_rows(WWTP_1999_2007, WWTP_2008_2013, WWTP_2014_2020)

Careful, there are a lot of double reporting starting 2008 ! We clean it.

Code

# Careful, a lot of doubel reporting !!!! check it in this this file
double_reporting_WWTP <- WWTP %>%
  select(Year, code_WWTP) %>%
  count(Year, code_WWTP) %>%
  filter(n !=1)
# we remove the double reporting
WWTP <- WWTP %>% distinct()

There remains a few double reporting, with different values. In the WWTP description files, these are :

Roussilon (code SANDRE 060984102001) in 1999: remove the one with capacity 350 (kept: 1100)
BISINCHI (code SANDRE 060920039001) in 2001: remove the one with capacity 500 (kept: 600)
ST ROMAIN DE JALIONAS (code SANDRE 060938451001) in 2001: remove the one with capacity 3200 (kept: 9000)
SARRAS - CHAMPIALET (code SANDRE 060907308002) in 2005: remove the one with capacity 50 (kept: 30)
AREGNO (code SANDRE 060920020001) in 2005 : remove the one with capacity 400 (kept: 9500)
JAILLANS - CHEF LIEU (code SANDRE 060926381001) in 2006: remove the one with capacity 200 (kept: 560)
ST APPOLINAIRE (code SANDRE 060969181001) in 2007 : remove the one with capacity 150 (kept: 170)
Montpellier (Maera) (code SANDRE 060934172001) in 2018, 2019, 2020: remove the one with unreported discharge coordinates

For the WWTP performances files, these are almost the same errors (except for Montpellier which is not double reported here). We correct both files.

Code

doublons_WWTP <- WWTP %>%
  select(Year, code_WWTP) %>%
  count(Year, code_WWTP) %>%
  filter(n !=1) %>%
  left_join(WWTP, by=c("Year", "code_WWTP"))

doublons_N_P <- N_P_rhone_mediterranee %>%
  select(Year, code_WWTP) %>%
  count(Year, code_WWTP) %>%
  filter(n !=1) %>%
  left_join(N_P_rhone_mediterranee, by=c("Year", "code_WWTP"))

#the ones to be removed in WWTP
temp <- doublons_WWTP %>%
  filter(!(code_WWTP=="060984102001" & Year==1999 & capacity==350)) %>%
  filter(!(code_WWTP=="060920039001" & Year==2001 & capacity==500)) %>%
  filter(!(code_WWTP=="060938451001" & Year==2001 & capacity==3200)) %>%
  filter(!(code_WWTP=="060907308002" & Year==2005 & capacity==50)) %>%
  filter(!(code_WWTP=="060920020001" & Year==2005 & capacity==400)) %>%
  filter(!(code_WWTP=="060926381001" & Year==2006 & capacity==200)) %>%
  filter(!(code_WWTP=="060969181001" & Year==2007 & capacity==150)) %>%
  filter(!(code_WWTP=="060934172001" & is.na(lat_discharge)==T))

WWTP <- anti_join(WWTP, temp)

#the ones to be removed in N_P_rhone_med
temp <- doublons_WWTP %>%
  filter(!(code_WWTP=="060984102001" & Year==1999 & capacity==350)) %>%
  filter(!(code_WWTP=="060920039001" & Year==2001 & capacity==500)) %>%
  filter(!(code_WWTP=="060938451001" & Year==2001 & capacity==3200)) %>%
  filter(!(code_WWTP=="060907308002" & Year==2005 & capacity==50)) %>%
  filter(!(code_WWTP=="060920020001" & Year==2005 & capacity==400)) %>%
  filter(!(code_WWTP=="060926381001" & Year==2006 & capacity==200)) %>%
  filter(!(code_WWTP=="060969181001" & Year==2007 & capacity==150)) 

N_P_rhone_mediterranee <- anti_join(N_P_rhone_mediterranee, temp)

We check reporting coherence of the 2 files. Starting 2010, the WWTP description file reports more WWTP (left), but in terms of total reported capacity the 2 are very similar (because the unreported stations are very small)..

Code

temp <- WWTP %>%
  group_by(Year) %>%
  summarise(capacity=sum(capacity, na.rm = T)/10^6, n=n())

temp2 <- N_P_rhone_mediterranee %>%
  group_by(Year) %>%
  summarise(capacity=sum(capacity, na.rm = T)/10^6, n=n())

plot_grid(
  ggplot(temp) + 
    geom_line(aes(Year, n, color = "CAT_descrip_step files")) + 
    geom_line(data = temp2, aes(Year, n, color = "CAT_perfostep files")) + 
    ylim(0, NA) +
    labs(
      x="", y="", subtitle = "nb of WWTP", caption = ""
    ) +
    theme(
      legend.position = "none"
    ),
  ggplot(temp) + 
    geom_line(data = temp, aes(Year, capacity, color = "CAT_descrip_step files")) + 
    geom_line(data = temp2, aes(Year, capacity, color = "CAT_perfostep files")) + 
    ylim(0, NA) +
    labs(
      x="", y="", subtitle = "total capacity", caption =  Source
    ) +
    theme(
      legend.position = c(0.4, 0.3), 
      legend.title = element_blank()
    )
)

Code

#only 1 WWTP not reporting capacity each year in the CAT_descrip_step files => OK
test <- WWTP %>%
  filter(is.na(capacity)==T) %>%
  group_by(Year) %>%
  summarise(n=n())

# <50 WWTP not reporting capacity each year in CAT_perfostep files files => OK
test <- N_P_rhone_mediterranee %>%
  filter(is.na(capacity)==T) %>%
  group_by(Year) %>%
  summarise(n=n())

rm(test)

We merge the 2 files. For the capacities, we give priority to the one reported in the perf file, and if it is empty, we use the one in the description file.

Code

#first wee keep the capacity, treatment type and name reported in both files
N_P_rhone_mediterranee <- 
  left_join(
    N_P_rhone_mediterranee %>%
      rename(
        treatment_type_perf = treatment_type,
        capacity_perf = capacity,
        name_WWTP_perf = name_WWTP
        ), 
    WWTP %>%
      rename(
        treatment_type_descr = treatment_type,
        capacity_descr = capacity,
        name_WWTP_descr = name_WWTP
        ), 
    by=c("Year", "code_WWTP")
  )

#we give priority to the perf file, and if empty we take the values of descr file
N_P_rhone_mediterranee <- N_P_rhone_mediterranee %>%
  mutate(
    capacity_perf = case_when(
      is.na(capacity_perf)==T ~ capacity_descr,
      T ~ capacity_perf
    ),
    treatment_type_perf = case_when(
      is.na(treatment_type_perf)==T ~ treatment_type_descr,
      T ~ treatment_type_perf
    ),
    name_WWTP_perf = case_when(
      is.na(name_WWTP_perf)==T ~ name_WWTP_descr,
      T ~ name_WWTP_perf
    )
  )

#We keep only the corrected ones
N_P_rhone_mediterranee <- N_P_rhone_mediterranee %>%
  rename(
    capacity = capacity_perf,
    treatment_type = treatment_type_perf,
    name_WWTP = name_WWTP_perf
  ) %>%
  select(
    -capacity_descr, -treatment_type_descr, -name_WWTP_descr
  )

If nutrient flows are negative or null we replace them with empty values

Code

# Some reported flows are negative or null. We replace them with empty values.
#Pt
N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$Pt_in <= 0] <- NA
N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$Pt_out <= 0] <- NA
#DBO5
N_P_rhone_mediterranee$DBO5_in[N_P_rhone_mediterranee$DBO5_in <= 0] <- NA
N_P_rhone_mediterranee$DBO5_out[N_P_rhone_mediterranee$DBO5_out <= 0] <- NA
#DCO
N_P_rhone_mediterranee$DCO_in[N_P_rhone_mediterranee$DCO_in <= 0] <- NA
N_P_rhone_mediterranee$DCO_out[N_P_rhone_mediterranee$DCO_out <= 0] <- NA
#MES
N_P_rhone_mediterranee$MES_in[N_P_rhone_mediterranee$MES_in <= 0] <- NA
N_P_rhone_mediterranee$MES_out[N_P_rhone_mediterranee$MES_out <= 0] <- NA
#NTK
N_P_rhone_mediterranee$NTK_in[N_P_rhone_mediterranee$NTK_in <= 0] <- NA
N_P_rhone_mediterranee$NTK_out[N_P_rhone_mediterranee$NTK_out <= 0] <- NA
#NGL
N_P_rhone_mediterranee$NGL_in[N_P_rhone_mediterranee$NGL_in <= 0] <- NA
N_P_rhone_mediterranee$NGL_out[N_P_rhone_mediterranee$NGL_out <= 0] <- NA

Some WWTP have unreported capacities. For the most recent of them we are able to get their capacity from the sanitation portal data.

Code

#get the list of WWTP with unreported capacities
unreported_capacity <- N_P_rhone_mediterranee %>% filter(is.na(capacity))

#get capacities from the sanitation portal
sanitation_portal_capacity <- read_csv("output_data/all_WWTP/all_WWTP_sanitation_portal.csv") %>%
  select(code_WWTP, capacity) %>%
  distinct()

#change value when possible
unreported_capacity <- left_join(
  unreported_capacity %>% rename(capacity_LB = capacity), 
  sanitation_portal_capacity %>% 
    select(code_WWTP, capacity), 
  by="code_WWTP"
) %>%
  select(-capacity_LB)

#we change the values in the main file.
N_P_rhone_mediterranee <- N_P_rhone_mediterranee %>% 
  filter(is.na(capacity)==F)

N_P_rhone_mediterranee <- bind_rows(
  N_P_rhone_mediterranee, unreported_capacity
)

In spite of this correction, there remains some unreported capacities but it is negligible (cf non discernable areas near 0 in the 2 graphs below).

Code

temp <- N_P_rhone_mediterranee %>%
  group_by(Year) %>%
  summarise(
    Pt_in = sum(Pt_in, na.rm=T)*365/10^6,
    nb_WWTP = n()
  )
temp2 <- unreported_capacity %>%
  filter(is.na(capacity)) %>%
  group_by(Year) %>%
  summarise(
    Pt_in = sum(Pt_in, na.rm=T)*365/10^6,
    nb_WWTP = n()
  )

plot_grid(
  ggplot(temp) +
    geom_line(aes(Year, nb_WWTP)) +
    geom_area(data = temp2, aes(Year, nb_WWTP)) +
    labs(
      x="", y="",
      subtitle = "nb of wastewater treatment plants",
      title = "Stations not reporting their nominal capacity",
      caption = "\n"
    ),
  ggplot(temp) +
    geom_line(aes(Year, Pt_in)) +
    geom_area(data = temp2, aes(Year, Pt_in)) +
    labs(
      x="", y="kt per year",
      subtitle = "in terms of incoming Pt flows",
      title = "",
      caption = Source
    ) +
    ylim(0, 15)
)

We compute yield and capacity categories

Code

N_P_rhone_mediterranee <-
  N_P_rhone_mediterranee %>%
  mutate(
    #Computes yields (even though sometimes yield is already reported, but we compare the consistency)
    Pt_yield = (1-Pt_out/Pt_in)*100, #to compare with reported P yield
    NGL_yield = (1-NGL_out/NGL_in)*100,
    DBO5_yield =(1-DBO5_out/DBO5_in)*100, 
    DCO_yield =(1-DCO_out/DCO_in)*100,
    MES_yield =(1-MES_out/MES_in)*100,
    #nutrient ratios
    N_P_ratio_in = NGL_in/Pt_in, 
    N_P_ratio_out = NGL_out/Pt_out,
    DCO_DBO5_ratio_in = DCO_in/DBO5_in,
    DCO_DBO5_ratio_out = DCO_out/DBO5_out,
    DBO5_N_ratio_in = DBO5_in/NGL_in,
    DBO5_N_ratio_out = DBO5_out/NGL_out,
    DBO5_P_ratio_in = DBO5_in/Pt_in,
    DBO5_P_ratio_out = DBO5_out/Pt_out
  )

We create the capacity categories in terms of population equivalent.

Code

#function to create capacity categories
f_PE_bin_categories <- function(dataset){
  #Creating size categories
  dataset <- 
    dataset %>%
    mutate(
      PE_bin = case_when(
        capacity < 200 ~ "0 - 200 PE",
        (capacity >= 200) & (capacity <2000) ~ "200 - 2 000 PE",
        (capacity >= 2000) & (capacity <10000) ~ "2 000 - 10 000 PE",
        (capacity >= 10000) & (capacity <100000) ~ "10 000 - 100 000 PE",
        capacity >= 100000 ~ "> 100 000 PE",
        T ~ "unreported PE"
      )
    )
  
  #reorders treatment by their share of total capacity
  dataset$PE_bin <- 
    factor(
      dataset$PE_bin, 
      levels = 
        c("unreported PE",
          "0 - 200 PE", 
          "200 - 2 000 PE", 
          "2 000 - 10 000 PE",
          "10 000 - 100 000 PE", 
          "> 100 000 PE"
          )
        )
  return(dataset)
}
N_P_rhone_mediterranee <- f_PE_bin_categories(N_P_rhone_mediterranee)

We create the file with aggregated flows at the basin scale, and also by capacity categories.

Code

f_basin_flows <- function(dataset){
  basin <- dataset %>%
    group_by(Year) %>%
    summarise(
      across(
        c(
          NGL_in, NTK_in, Pt_in, DBO5_in, DCO_in, MES_in,
          NGL_out, NTK_out, Pt_out, DBO5_out, DCO_out, MES_out
        ),
        ~signif(sum(.x, na.rm = T)*365/10^6, digits=3)
      ),
    #nb of waste water treatment plant
    nb_WWTP = n(),
    #capacity converted in million Population Equivalent
    capacity = signif(sum(capacity, na.rm = T)/10^6, digits=3),
    )
  return(basin)
}
basin_N_P_rhone_mediterranee <- f_basin_flows(N_P_rhone_mediterranee)

f_basin_PE_flows <- function(dataset){
  basin <- dataset %>%
    group_by(Year, PE_bin) %>%
    summarise(
      across(
        c(
          NGL_in, NTK_in, Pt_in, DBO5_in, DCO_in, MES_in,
          NGL_out, NTK_out, Pt_out, DBO5_out, DCO_out, MES_out
        ),
        ~signif(sum(.x, na.rm = T)*365/10^6, digits=3)
      ),
    #nb of waste water treatment plant
    nb_WWTP = n(),
    #capacity converted in million Population Equivalent
    capacity = signif(sum(capacity, na.rm = T)/10^6, digits=3),
    )
  return(basin)
}
basin_PE_N_P_rhone_mediterranee <- f_basin_PE_flows(N_P_rhone_mediterranee)

We compute the nutrient ratios at the basin scale, and also by capacity categories.

Code

#have to do this in case 1 nutrient is more reported than the other, which would create a bia if we took the ratio of the already aggregated flows
f_nutrient_ratio_basin <- function(basin, dataset, nutrient1, nutrient2){
  temp <- dataset %>% 
    filter(
      is.na(!!as.symbol(nutrient1))==F & is.na(!!as.symbol(nutrient2))==F
      ) %>%
    group_by(Year) %>%
    summarise(
      nutrient_1 = sum(!!as.symbol(nutrient1), na.rm=T),
      nutrient_2 = sum(!!as.symbol(nutrient2), na.rm=T),
      ratio = signif(nutrient_1/nutrient_2, digits=3)
    ) %>%
    select(-nutrient_1, -nutrient_2)
  basin <- left_join(
    basin, temp, by="Year"
  )
  return(basin)
}

f_nutrient_ratio_basin_PE <- function(basin_PE, dataset, nutrient1, nutrient2){
  temp <- dataset %>% 
    filter(
      is.na(!!as.symbol(nutrient1))==F & is.na(!!as.symbol(nutrient2))==F
      ) %>%
    group_by(Year, PE_bin) %>%
    summarise(
      nutrient_1 = sum(!!as.symbol(nutrient1), na.rm=T),
      nutrient_2 = sum(!!as.symbol(nutrient2), na.rm=T),
      ratio = signif(nutrient_1/nutrient_2, digits=3)
    ) %>%
    select(-nutrient_1, -nutrient_2)
  basin_PE <- left_join(
    basin_PE, temp, by=c("Year", "PE_bin")
  )
  return(basin_PE)
}

f_all_nutrient_ratios_basin <- function(basin, dataset){
  basin <- basin %>%
    
    #nutrient ratios
    #N_P in
    f_nutrient_ratio_basin(dataset, "NGL_in", "Pt_in") %>%
    rename(N_P_ratio_in = ratio) %>%
    #N_P out
    f_nutrient_ratio_basin(dataset, "NGL_out", "Pt_out") %>%
    rename(N_P_ratio_out = ratio) %>%
    #DCO_DBO5 in
    f_nutrient_ratio_basin(dataset, "DCO_in", "DBO5_in") %>%
    rename(DCO_DBO5_ratio_in = ratio) %>%
    #DCO_DBO5 out
    f_nutrient_ratio_basin(dataset, "DCO_out", "DBO5_out") %>%
    rename(DCO_DBO5_ratio_out = ratio) %>%
    #DBO5_N in
    f_nutrient_ratio_basin(dataset, "DBO5_in", "NGL_in") %>%
    rename(DBO5_N_ratio_in = ratio) %>%
    #DBO5_N out
    f_nutrient_ratio_basin(dataset, "DBO5_out", "NGL_out") %>%
    rename(DBO5_N_ratio_out = ratio) %>%
    #DBO5_P in
    f_nutrient_ratio_basin(dataset, "DBO5_in", "Pt_in") %>%
    rename(DBO5_P_ratio_in = ratio) %>%
    #DBO5_P out
    f_nutrient_ratio_basin(dataset, "DBO5_out", "Pt_out") %>%
    rename(DBO5_P_ratio_out = ratio) %>%
    #DCO_N in
    f_nutrient_ratio_basin(dataset, "DCO_in", "NGL_in") %>%
    rename(DCO_N_ratio_in = ratio) %>%
    #DCO_N out
    f_nutrient_ratio_basin(dataset, "DCO_out", "NGL_out") %>%
    rename(DCO_N_ratio_out = ratio) %>%
    #DCO_P in
    f_nutrient_ratio_basin(dataset, "DCO_in", "Pt_in") %>%
    rename(DCO_P_ratio_in = ratio) %>%
    #DCO_P out
    f_nutrient_ratio_basin(dataset, "DCO_out", "Pt_out") %>%
    rename(DCO_P_ratio_out = ratio) %>%
    
    #capacity ratios
    #Pt_PE_in
    f_nutrient_ratio_basin(dataset, "Pt_in", "capacity") %>%
    rename(Pt_PE_ratio_in = ratio) %>%
    #Pt_PE_out
    f_nutrient_ratio_basin(dataset, "Pt_out", "capacity") %>%
    rename(Pt_PE_ratio_out = ratio) %>%
    #NGL_PE_in
    f_nutrient_ratio_basin(dataset, "NGL_in", "capacity") %>%
    rename(NGL_PE_ratio_in = ratio) %>%
    #NGL_PE_out
    f_nutrient_ratio_basin(dataset, "NGL_out", "capacity") %>%
    rename(NGL_PE_ratio_out = ratio) %>%
    #DBO5_PE_in
    f_nutrient_ratio_basin(dataset, "DBO5_in", "capacity") %>%
    rename(DBO5_PE_ratio_in = ratio) %>%
    #DBO5_PE_out
    f_nutrient_ratio_basin(dataset, "DBO5_out", "capacity") %>%
    rename(DBO5_PE_ratio_out = ratio) %>%
    #DCO_PE_in
    f_nutrient_ratio_basin(dataset, "DCO_in", "capacity") %>%
    rename(DCO_PE_ratio_in = ratio) %>%
    #DCO_PE_out
    f_nutrient_ratio_basin(dataset, "DCO_out", "capacity") %>%
    rename(DCO_PE_ratio_out = ratio) %>%
    #MES_PE_in
    f_nutrient_ratio_basin(dataset, "MES_in", "capacity") %>%
    rename(MES_PE_ratio_in = ratio) %>%
    #MES_PE_out
    f_nutrient_ratio_basin(dataset, "MES_out", "capacity") %>%
    rename(MES_PE_ratio_out = ratio) %>%
    
    #convert from kg per PE per day to g per PE per day
    mutate(
      across(
        c(
          Pt_PE_ratio_in, Pt_PE_ratio_out, NGL_PE_ratio_in, NGL_PE_ratio_out, DBO5_PE_ratio_in, DBO5_PE_ratio_out,
          DCO_PE_ratio_in, DCO_PE_ratio_out, MES_PE_ratio_in, MES_PE_ratio_out
        ), ~.x*1000 
      )
    )
  return(basin)
}

f_all_nutrient_ratios_basin_PE <- function(basin_PE, dataset){
  basin_PE <- basin_PE %>%
    
    #nutrient ratios
    #N_P in
    f_nutrient_ratio_basin_PE(dataset, "NGL_in", "Pt_in") %>%
    rename(N_P_ratio_in = ratio) %>%
    #N_P out
    f_nutrient_ratio_basin_PE(dataset, "NGL_out", "Pt_out") %>%
    rename(N_P_ratio_out = ratio) %>%
    #DCO_DBO5 in
    f_nutrient_ratio_basin_PE(dataset, "DCO_in", "DBO5_in") %>%
    rename(DCO_DBO5_ratio_in = ratio) %>%
    #DCO_DBO5 out
    f_nutrient_ratio_basin_PE(dataset, "DCO_out", "DBO5_out") %>%
    rename(DCO_DBO5_ratio_out = ratio) %>%
    #DBO5_N in
    f_nutrient_ratio_basin_PE(dataset, "DBO5_in", "NGL_in") %>%
    rename(DBO5_N_ratio_in = ratio) %>%
    #DBO5_N out
    f_nutrient_ratio_basin_PE(dataset, "DBO5_out", "NGL_out") %>%
    rename(DBO5_N_ratio_out = ratio) %>%
    #DBO5_P in
    f_nutrient_ratio_basin_PE(dataset, "DBO5_in", "Pt_in") %>%
    rename(DBO5_P_ratio_in = ratio) %>%
    #DBO5_P out
    f_nutrient_ratio_basin_PE(dataset, "DBO5_out", "Pt_out") %>%
    rename(DBO5_P_ratio_out = ratio) %>%
    #DCO_N in
    f_nutrient_ratio_basin_PE(dataset, "DCO_in", "NGL_in") %>%
    rename(DCO_N_ratio_in = ratio) %>%
    #DCO_N out
    f_nutrient_ratio_basin_PE(dataset, "DCO_out", "NGL_out") %>%
    rename(DCO_N_ratio_out = ratio) %>%
    #DCO_P in
    f_nutrient_ratio_basin_PE(dataset, "DCO_in", "Pt_in") %>%
    rename(DCO_P_ratio_in = ratio) %>%
    #DCO_P out
    f_nutrient_ratio_basin_PE(dataset, "DCO_out", "Pt_out") %>%
    rename(DCO_P_ratio_out = ratio) %>%
    
    #capacity ratios
    #Pt_PE_in
    f_nutrient_ratio_basin_PE(dataset, "Pt_in", "capacity") %>%
    rename(Pt_PE_ratio_in = ratio) %>%
    #Pt_PE_out
    f_nutrient_ratio_basin_PE(dataset, "Pt_out", "capacity") %>%
    rename(Pt_PE_ratio_out = ratio) %>%
    #NGL_PE_in
    f_nutrient_ratio_basin_PE(dataset, "NGL_in", "capacity") %>%
    rename(NGL_PE_ratio_in = ratio) %>%
    #NGL_PE_out
    f_nutrient_ratio_basin_PE(dataset, "NGL_out", "capacity") %>%
    rename(NGL_PE_ratio_out = ratio) %>%
    #DBO5_PE_in
    f_nutrient_ratio_basin_PE(dataset, "DBO5_in", "capacity") %>%
    rename(DBO5_PE_ratio_in = ratio) %>%
    #DBO5_PE_out
    f_nutrient_ratio_basin_PE(dataset, "DBO5_out", "capacity") %>%
    rename(DBO5_PE_ratio_out = ratio) %>%
    #DCO_PE_in
    f_nutrient_ratio_basin_PE(dataset, "DCO_in", "capacity") %>%
    rename(DCO_PE_ratio_in = ratio) %>%
    #DCO_PE_out
    f_nutrient_ratio_basin_PE(dataset, "DCO_out", "capacity") %>%
    rename(DCO_PE_ratio_out = ratio) %>%
    #MES_PE_in
    f_nutrient_ratio_basin_PE(dataset, "MES_in", "capacity") %>%
    rename(MES_PE_ratio_in = ratio) %>%
    #MES_PE_out
    f_nutrient_ratio_basin_PE(dataset, "MES_out", "capacity") %>%
    rename(MES_PE_ratio_out = ratio) %>%
    
    #convert from kg per PE per day to g per PE per day
    mutate(
      across(
        c(
          Pt_PE_ratio_in, Pt_PE_ratio_out, NGL_PE_ratio_in, NGL_PE_ratio_out, DBO5_PE_ratio_in, DBO5_PE_ratio_out,
          DCO_PE_ratio_in, DCO_PE_ratio_out, MES_PE_ratio_in, MES_PE_ratio_out
        ), ~.x*1000 
      )
    )
  return(basin_PE)
}

basin_N_P_rhone_mediterranee <- f_all_nutrient_ratios_basin(basin_N_P_rhone_mediterranee, N_P_rhone_mediterranee)

basin_PE_N_P_rhone_mediterranee <- f_all_nutrient_ratios_basin_PE(basin_PE_N_P_rhone_mediterranee, N_P_rhone_mediterranee)

We compute the yields at the basin scale, and also by capacity categories.

Code

#have to do this in case inflow or outflow is more reported than the other one, which would create a bias if we took the ratio of the already aggregated flows
f_yield_basin <- function(basin, dataset, nutrientIN, nutrientOUT){
  temp <- dataset %>% 
    filter(
      is.na(!!as.symbol(nutrientIN))==F & is.na(!!as.symbol(nutrientOUT))==F
      ) %>%
    group_by(Year) %>%
    summarise(
      nutrient_in = sum(!!as.symbol(nutrientIN), na.rm=T),
      nutrient_out = sum(!!as.symbol(nutrientOUT), na.rm=T),
      yield = round((1-nutrient_out/nutrient_in)*100, digits = 0)
    ) %>%
    select(-nutrient_in, -nutrient_out)
  basin <- left_join(
    basin, temp, by="Year"
  )
  return(basin)
}

f_yield_basin_PE <- function(basin_PE, dataset, nutrientIN, nutrientOUT){
  temp <- dataset %>% 
    filter(
      is.na(!!as.symbol(nutrientIN))==F & is.na(!!as.symbol(nutrientOUT))==F
      ) %>%
    group_by(Year, PE_bin) %>%
    summarise(
      nutrient_in = sum(!!as.symbol(nutrientIN), na.rm=T),
      nutrient_out = sum(!!as.symbol(nutrientOUT), na.rm=T),
      yield = round((1-nutrient_out/nutrient_in)*100, digits = 0)
    ) %>%
    select(-nutrient_in, -nutrient_out)
  basin_PE <- left_join(
    basin_PE, temp, by=c("Year", "PE_bin")
  )
  return(basin_PE)
}

f_all_yields_basin <- function(basin, dataset){
  basin <- basin %>%
    #NGL yield
    f_yield_basin(dataset, "NGL_in", "NGL_out") %>%
    rename(NGL_yield = yield) %>%
    #Pt yield
    f_yield_basin(dataset, "Pt_in", "Pt_out") %>%
    rename(Pt_yield = yield) %>%
    #DBO5 yield
    f_yield_basin(dataset, "DBO5_in", "DBO5_out") %>%
    rename(DBO5_yield = yield) %>%
    #DCO yield
    f_yield_basin(dataset, "DCO_in", "DCO_out") %>%
    rename(DCO_yield = yield) %>%
    #MES yield
    f_yield_basin(dataset, "MES_in", "MES_out") %>%
    rename(MES_yield = yield) 
  return(basin)
}

f_all_yields_basin_PE <- function(basin_PE, dataset){
  basin_PE <- basin_PE %>%
    #NGL yield
    f_yield_basin_PE(dataset, "NGL_in", "NGL_out") %>%
    rename(NGL_yield = yield) %>%
    #Pt yield
    f_yield_basin_PE(dataset, "Pt_in", "Pt_out") %>%
    rename(Pt_yield = yield) %>%
    #DBO5 yield
    f_yield_basin_PE(dataset, "DBO5_in", "DBO5_out") %>%
    rename(DBO5_yield = yield) %>%
    #DCO yield
    f_yield_basin_PE(dataset, "DCO_in", "DCO_out") %>%
    rename(DCO_yield = yield) %>%
    #MES yield
    f_yield_basin_PE(dataset, "MES_in", "MES_out") %>%
    rename(MES_yield = yield) 
  return(basin_PE)
}

basin_N_P_rhone_mediterranee <- f_all_yields_basin(basin_N_P_rhone_mediterranee, N_P_rhone_mediterranee)

basin_PE_N_P_rhone_mediterranee <- f_all_yields_basin_PE(basin_PE_N_P_rhone_mediterranee, N_P_rhone_mediterranee)

We create the years categories (every 5 years).

Code

#function to create years categories
f_year_categories <- function(dataset){
  dataset <- dataset %>%
    mutate(
      Year_category = case_when(
        Year %in% c(1991, 1992, 1993, 1994, 1995) ~ "1991-1995",
        Year %in% c(1996, 1997, 1998, 1999, 2000) ~ "1996-2000",
        Year %in% c(2001, 2002, 2003, 2004, 2005) ~ "2001-2005",
        Year %in% c(2006, 2007, 2008, 2009, 2010) ~ "2006-2010",
        Year %in% c(2011, 2012, 2013, 2014, 2015) ~ "2011-2015",
        Year %in% c(2016, 2017, 2018, 2019, 2020) ~ "2016-2020",
      )
    )
  return(dataset)
}
N_P_rhone_mediterranee <- f_year_categories(N_P_rhone_mediterranee)
basin_N_P_rhone_mediterranee <- f_year_categories(basin_N_P_rhone_mediterranee)
basin_PE_N_P_rhone_mediterranee <- f_year_categories(basin_PE_N_P_rhone_mediterranee)

Code

rm(
  file_WWTP_1999_2007, file_WWTP_2008_2013, file_WWTP_2014_2020,
  file_1999_2007, file_2008_2020,
  WWTP_1999_2007, WWTP_2008_2013, WWTP_2014_2020,
  N_P_rhone_mediterranee_1999_2007, N_P_rhone_mediterranee_2008_2020,
  doublons_N_P, doublons_WWTP, doublons_1999_2007, double_reporting_WWTP,
  unreported_capacity,
  sanitation_portal_capacity
)

Data cleaning

Outliers : first visualization

Code

f_graph_nutrient <- function(dataset, nutrient_in, nutrient_out, label, legend_x, legend_y){
  p <- ggplot(dataset) + 
    #nutrient inflow
    geom_line(
      aes(
        Year, 
        !!as.symbol(nutrient_in), 
        color=nutrient_in
        )
      ) + 
    #nutrient outflow
    geom_line(
      aes(
        Year,
        !!as.symbol(nutrient_out), 
        color = nutrient_out
        )
      ) +
    ylim(0, NA) +
    theme(
      legend.position = c(legend_x, legend_y), 
      legend.title = element_blank()
      ) +
    labs(
      x="", y=paste("kt of", label) , 
      title = paste("Reported", label, "flows in Rhône-Méditerranée WWTPs") ,
      subtitle = "reported, not necessarily actual ; here before data cleaning", 
      caption = Source
      )
  return(p)
}
##if want to try interactive plot :
# library(plotly)
# ggplotly(p, tooltip = c("y", "x"))

We correct 1 large outliers to be able to visualize the flows : incoming NGL and NTK in2018 for STATION D’EPURATION DE NARBONNE - VILLE (code SANDRE 060911262001): error of 4 ODM

Code

N_P_rhone_mediterranee$NGL_in[N_P_rhone_mediterranee$code_WWTP == "060911262001" & N_P_rhone_mediterranee$Year == 2018] <- 
  N_P_rhone_mediterranee$NGL_in[N_P_rhone_mediterranee$code_WWTP == "060911262001" & N_P_rhone_mediterranee$Year == 2018]/10^4

N_P_rhone_mediterranee$NTK_in[N_P_rhone_mediterranee$code_WWTP == "060911262001" & N_P_rhone_mediterranee$Year == 2018] <- 
  N_P_rhone_mediterranee$NTK_in[N_P_rhone_mediterranee$code_WWTP == "060911262001" & N_P_rhone_mediterranee$Year == 2018]/10^4

We recompute the basin flows after this correction.

Code

N_P_rhone_mediterranee <-
  N_P_rhone_mediterranee %>%
  mutate(
    #Computes yields (even though sometimes yield is already reported, but we compare the consistency)
    Pt_yield = (1-Pt_out/Pt_in)*100, #to compare with reported P yield
    NGL_yield = (1-NGL_out/NGL_in)*100,
    DBO5_yield =(1-DBO5_out/DBO5_in)*100, 
    DCO_yield =(1-DCO_out/DCO_in)*100,
    MES_yield =(1-MES_out/MES_in)*100,
    #nutrient ratios
    N_P_ratio_in = NGL_in/Pt_in, 
    N_P_ratio_out = NGL_out/Pt_out,
    DCO_DBO5_ratio_in = DCO_in/DBO5_in,
    DCO_DBO5_ratio_out = DCO_out/DBO5_out,
    DBO5_N_ratio_in = DBO5_in/NGL_in,
    DBO5_N_ratio_out = DBO5_out/NGL_out,
    DBO5_P_ratio_in = DBO5_in/Pt_in,
    DBO5_P_ratio_out = DBO5_out/Pt_out
  )

#recompute basin values (flows, yields, ratios..)
basin_N_P_rhone_mediterranee <- f_basin_flows(N_P_rhone_mediterranee)
basin_N_P_rhone_mediterranee <- f_all_nutrient_ratios_basin(basin_N_P_rhone_mediterranee, N_P_rhone_mediterranee)
basin_N_P_rhone_mediterranee <- f_all_yields_basin(basin_N_P_rhone_mediterranee, N_P_rhone_mediterranee)
basin_N_P_rhone_mediterranee <- f_year_categories(basin_N_P_rhone_mediterranee)

#recompute basin x PE values (flows, yields, ratios..)
basin_PE_N_P_rhone_mediterranee <- f_basin_PE_flows(N_P_rhone_mediterranee)
basin_PE_N_P_rhone_mediterranee <- f_all_nutrient_ratios_basin_PE(basin_PE_N_P_rhone_mediterranee, N_P_rhone_mediterranee)
basin_PE_N_P_rhone_mediterranee <- f_all_yields_basin_PE(basin_PE_N_P_rhone_mediterranee, N_P_rhone_mediterranee)
basin_PE_N_P_rhone_mediterranee <- f_year_categories(basin_PE_N_P_rhone_mediterranee)


#we introduce empty values for unreported flows
#flows basin
basin_N_P_rhone_mediterranee$NGL_out[basin_N_P_rhone_mediterranee$Year < 2009] <- NA
basin_N_P_rhone_mediterranee$NTK_out[basin_N_P_rhone_mediterranee$Year > 2007] <- NA
basin_N_P_rhone_mediterranee$DCO_in[basin_N_P_rhone_mediterranee$Year < 2009] <- NA
basin_N_P_rhone_mediterranee$DCO_out[basin_N_P_rhone_mediterranee$Year < 2009] <- NA
#flows basin x PE
basin_PE_N_P_rhone_mediterranee$NGL_out[basin_PE_N_P_rhone_mediterranee$Year < 2009] <- NA
basin_PE_N_P_rhone_mediterranee$NTK_out[basin_PE_N_P_rhone_mediterranee$Year > 2007] <- NA
basin_PE_N_P_rhone_mediterranee$DCO_in[basin_PE_N_P_rhone_mediterranee$Year < 2009] <- NA
basin_PE_N_P_rhone_mediterranee$DCO_out[basin_PE_N_P_rhone_mediterranee$Year < 2009] <- NA

It also appears that in 2014, all flows of the Pia sation (code SANDRE 060966141002) are overestimated by a 3 OM.

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "NGL_in", "NGL_out", "NGL", 0.7, 0.5)

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "NTK_in", "NTK_out", "NTK", 0.7, 0.5)

outliers :

Pt in - 2019, STATION D’EPURATION DE BESSEGES, code SANDRE 060930037002 - 2010, STATION D’EPURATION DE ALES, code SANDRE 060930259003 - 2019, STATION D’EPURATION DE ST AMBROIX - MAS CHABERT, code SANDRE 060930227002 - 2006, DIJON (EAUVITALE), code SANDRE 060921231001
Pt_out - 2019, STATION D’EPURATION DE ST AMBROIX - MAS CHABERT, code SANDRE 060930227002 - 2019, STATION D’EPURATION DE BESSEGES, code SANDRE 060930037002 - 2014, STATION D’EPURATION DE SCIENTRIER, code SANDRE 060974220001

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "Pt_in", "Pt_out", "Pt", 0.4, 0.5)

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "DBO5_in", "DBO5_out", "DBO5", 0.4, 0.5)

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "DCO_in", "DCO_out", "DCO", 0.4, 0.5)

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "MES_in", "MES_out", "MES", 0.4, 0.5)

For Pia in 2014, we correct all flows by 3 ODM

Code

#Pia 2014, all flows
#MES_in
N_P_rhone_mediterranee$MES_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$MES_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3
#MES_out
N_P_rhone_mediterranee$MES_out[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$MES_out[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3
#Pt_in
N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3
#Pt_out
N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3
#NTK_in
N_P_rhone_mediterranee$NTK_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$NTK_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3
#NGL_in
N_P_rhone_mediterranee$NGL_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$NGL_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3
#NGL_out
N_P_rhone_mediterranee$NGL_out[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$NGL_out[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3
#DBO5_in
N_P_rhone_mediterranee$DBO5_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$DBO5_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3
#DBO5_out
N_P_rhone_mediterranee$DBO5_out[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$DBO5_out[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3
#DCO_in
N_P_rhone_mediterranee$DCO_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$DCO_in[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3
#DCO_out
N_P_rhone_mediterranee$DCO_out[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$DCO_out[N_P_rhone_mediterranee$code_WWTP == "060966141002" & N_P_rhone_mediterranee$Year == 2014]/10^3

The remaining errors are mainly for Pt

Pt in - 2019 and 2020, STATION D’EPURATION DE BESSEGES, code SANDRE 060930037002, 3 OM - 2010 and 2009, STATION D’EPURATION DE ALES, code SANDRE 060930259003, inconsistent = >removed - 2019, STATION D’EPURATION DE ST AMBROIX - MAS CHABERT, code SANDRE 060930227002 , 3 OM - 2006, DIJON (EAUVITALE), code SANDRE 060921231001, 1 OM
Pt_out - 2019, STATION D’EPURATION DE ST AMBROIX - MAS CHABERT, code SANDRE 060930227002, 3 OM - 2019 and 2020, STATION D’EPURATION DE BESSEGES, code SANDRE 060930037002, 3 OM - 2013 and 2014, STATION D’EPURATION DE SCIENTRIER, code SANDRE 060974220001, 1 and 2 OM

Code

#Pt in
#Besseges
N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060930037002" & N_P_rhone_mediterranee$Year == 2019] <- 
  N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060930037002" & N_P_rhone_mediterranee$Year == 2019]/10^3
N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060930037002" & N_P_rhone_mediterranee$Year == 2020] <- 
  N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060930037002" & N_P_rhone_mediterranee$Year == 2020]/10^3
#Ales
N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060930259003" & N_P_rhone_mediterranee$Year == 2009] <- NA
N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060930259003" & N_P_rhone_mediterranee$Year == 2010] <- NA
#St ambroix
N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060930227002" & N_P_rhone_mediterranee$Year == 2019] <- 
  N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060930227002" & N_P_rhone_mediterranee$Year == 2019]/10^3
#Dijon
N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060921231001" & N_P_rhone_mediterranee$Year == 2006] <- 
  N_P_rhone_mediterranee$Pt_in[N_P_rhone_mediterranee$code_WWTP == "060921231001" & N_P_rhone_mediterranee$Year == 2006]/10

#Pt out
#Saint Ambroix
N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060930227002" & N_P_rhone_mediterranee$Year == 2019] <- 
  N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060930227002" & N_P_rhone_mediterranee$Year == 2019]/10^3
#Besseges
N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060930037002" & N_P_rhone_mediterranee$Year == 2019] <- 
  N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060930037002" & N_P_rhone_mediterranee$Year == 2019]/10^3
N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060930037002" & N_P_rhone_mediterranee$Year == 2020] <- 
  N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060930037002" & N_P_rhone_mediterranee$Year == 2020]/10^3
#Scientrier
N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060974220001" & N_P_rhone_mediterranee$Year == 2013] <- 
  N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060974220001" & N_P_rhone_mediterranee$Year == 2013]/10
N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060974220001" & N_P_rhone_mediterranee$Year == 2014] <- 
  N_P_rhone_mediterranee$Pt_out[N_P_rhone_mediterranee$code_WWTP == "060974220001" & N_P_rhone_mediterranee$Year == 2014]/10^2

We recompute the values after the outliers corrections.

Code

N_P_rhone_mediterranee <-
  N_P_rhone_mediterranee %>%
  mutate(
    #Computes yields (even though sometimes yield is already reported, but we compare the consistency)
    Pt_yield = (1-Pt_out/Pt_in)*100, #to compare with reported P yield
    NGL_yield = (1-NGL_out/NGL_in)*100,
    DBO5_yield =(1-DBO5_out/DBO5_in)*100, 
    DCO_yield =(1-DCO_out/DCO_in)*100,
    MES_yield =(1-MES_out/MES_in)*100,
    #nutrient ratios
    N_P_ratio_in = NGL_in/Pt_in, 
    N_P_ratio_out = NGL_out/Pt_out,
    DCO_DBO5_ratio_in = DCO_in/DBO5_in,
    DCO_DBO5_ratio_out = DCO_out/DBO5_out,
    DBO5_N_ratio_in = DBO5_in/NGL_in,
    DBO5_N_ratio_out = DBO5_out/NGL_out,
    DBO5_P_ratio_in = DBO5_in/Pt_in,
    DBO5_P_ratio_out = DBO5_out/Pt_out
  )

#recompute basin values (flows, yields, ratios..)
basin_N_P_rhone_mediterranee <- f_basin_flows(N_P_rhone_mediterranee)
basin_N_P_rhone_mediterranee <- f_all_nutrient_ratios_basin(basin_N_P_rhone_mediterranee, N_P_rhone_mediterranee)
basin_N_P_rhone_mediterranee <- f_all_yields_basin(basin_N_P_rhone_mediterranee, N_P_rhone_mediterranee)
basin_N_P_rhone_mediterranee <- f_year_categories(basin_N_P_rhone_mediterranee)

#recompute basin x PE values (flows, yields, ratios..)
basin_PE_N_P_rhone_mediterranee <- f_basin_PE_flows(N_P_rhone_mediterranee)
basin_PE_N_P_rhone_mediterranee <- f_all_nutrient_ratios_basin_PE(basin_PE_N_P_rhone_mediterranee, N_P_rhone_mediterranee)
basin_PE_N_P_rhone_mediterranee <- f_all_yields_basin_PE(basin_PE_N_P_rhone_mediterranee, N_P_rhone_mediterranee)
basin_PE_N_P_rhone_mediterranee <- f_year_categories(basin_PE_N_P_rhone_mediterranee)


#we introduce empty values for unreported flows
#flows basin
basin_N_P_rhone_mediterranee$NGL_out[basin_N_P_rhone_mediterranee$Year < 2009] <- NA
basin_N_P_rhone_mediterranee$NTK_out[basin_N_P_rhone_mediterranee$Year > 2007] <- NA
basin_N_P_rhone_mediterranee$DCO_in[basin_N_P_rhone_mediterranee$Year < 2009] <- NA
basin_N_P_rhone_mediterranee$DCO_out[basin_N_P_rhone_mediterranee$Year < 2009] <- NA
#flows basin x PE
basin_PE_N_P_rhone_mediterranee$NGL_out[basin_PE_N_P_rhone_mediterranee$Year < 2009] <- NA
basin_PE_N_P_rhone_mediterranee$NTK_out[basin_PE_N_P_rhone_mediterranee$Year > 2007] <- NA
basin_PE_N_P_rhone_mediterranee$DCO_in[basin_PE_N_P_rhone_mediterranee$Year < 2009] <- NA
basin_PE_N_P_rhone_mediterranee$DCO_out[basin_PE_N_P_rhone_mediterranee$Year < 2009] <- NA

Capacities distribution

Code

temp <- N_P_rhone_mediterranee %>%
  group_by(Year) %>%
  summarise(
    capacity = sum(capacity, na.rm = T)/10^6, #capacity in million PE
    nb_WWTP = n()
  )

Code

coef <- max(temp$capacity)/max(temp$nb_WWTP)
ggplot(temp) +
  geom_line(
    aes(
      Year, nb_WWTP, 
      color = "number of reported facilities (left)"
      )
    ) + 
  geom_line(
    aes(
      Year, capacity/coef, 
      color = "total reported capacity (right)"
      )
    ) + 
  scale_y_continuous(
    limits = c(0, NA),
    sec.axis = 
      sec_axis(
        trans=~.*coef, 
        name="million Population Equivalent"
        )
    ) +
  labs(
    title = "Evolution of the reporting in the database",
    subtitle = "in terms of number of WWTP reported and total reported capacity",
    y="", x="", 
    color="", caption =Source
  ) +
  theme(
    legend.position = c(0.7, 0.3)
  )

Code

temp <- N_P_rhone_mediterranee %>%
  filter(is.na(capacity)==F) %>%
  select(Year, capacity, PE_bin) %>%
  group_by(Year, PE_bin) %>%
  summarise(
    `capacity (million PE)` = sum(capacity)/10^6,
    `number of stations` = n()
  ) %>% 
  gather(key=capacity_or_n, value = value, `capacity (million PE)`, `number of stations`)

Absolute
Relative

Code

ggplot(temp) + 
  geom_area(aes(Year, value, fill=PE_bin)) + 
  facet_wrap(vars(capacity_or_n), scales="free") + 
  labs(
    title="Reporting in the database",
    subtitle = "For each capacity category",
    x="", y="", fill="nominal capacity \n(Population Equivalent)",
    caption = Source
  )

Code

ggplot(temp) + 
  geom_area(aes(Year, value, fill=PE_bin), position = "fill") + 
  facet_wrap(vars(capacity_or_n), scales="free") + 
  labs(
    title="Reporting in the database",
    subtitle = "Proportion of each capacity category",
    x="", y="", fill="nominal capacity \n(Population Equivalent)",
    caption = Source
  )

Code

temp <- N_P_rhone_mediterranee %>% filter(Year==Year_analysis)
ggplot(temp) + 
  geom_histogram(
    aes(
      capacity, 
      fill = "Nb of facilities"
      ), 
    n=100, alpha=.4, stat="density"
    ) +
  geom_histogram(
    aes(
      capacity, weight = capacity, 
      fill="Nb of facilities weighted by capacity"
      ), 
    n=100, alpha=.4, stat="density"
    ) +
  theme(
    legend.position = c(0.7,0.8),
  ) +
  labs(
    x="Waste Water Treatment Plant Capacity \n(Population Equivalent)",
    y="Distribution density",
    fill="Distribution of",
    title = paste("WWTP capacities distribution", as.character(Year_analysis)),
    subtitle = "raw of weighted by capacity"
  ) +
  scale_x_log10(
    labels = scales::label_number(drop0trailing = TRUE)
    )

Code

temp <- N_P_rhone_mediterranee %>% 
  ungroup() %>%
  filter(Year==Year_analysis) %>% 
  select(code_WWTP, name_WWTP, capacity) %>%
  filter(is.na(capacity) == F) %>%
  arrange(desc(capacity)) %>%
  mutate( 
    cumulative_capacity = cumsum(capacity)/10^6,
    rank_STEU = rank(-capacity, ties.method = "first"),
    percent_cumulative_capacity = round(cumulative_capacity/sum(capacity/10^6)*100, digits = 1),
    percent_rank = round(rank_STEU/n()*100, digits = 1)
    ) 

f_save_csv_files(
  temp %>% mutate(basin = "Rhône-Méditerranée"), 
  "output_data/zipf_law/",
  "zipf_law_06_rhone_mediterranee.csv"
)

coef <- max(temp$rank_STEU)/100
coef2 <- max(temp$cumulative_capacity)/100

Code

ggplot(temp) +
  geom_step(
    aes(
      x = percent_rank, y = percent_cumulative_capacity
      )
    ) + 
  labs(
    title = paste("Cumulative distribution,", Year_analysis),
    subtitle="nb of WWTP vs total capacity",
    x="% of WWTP", y="% of total capacity",
    caption = Source
  ) +
  scale_x_continuous(
    sec.axis = 
      sec_axis(
        trans=~.*coef, name="nb of WWTP",
        labels = scales::label_number(drop0trailing = TRUE)
        )
    ) + 
  scale_y_continuous(
    sec.axis = 
      sec_axis(
        trans=~.*coef2, 
        name="cumulative capacity \n(millions PE)"
        )
    ) + 
    theme(legend.position = "none")

Code

ggplot(temp) +
  geom_step(
    aes(
      x = percent_rank, y = percent_cumulative_capacity
      )
    ) + 
  labs(
    title = paste("Cumulative distribution,", Year_analysis),
    subtitle="nb of WWTP vs total capacity",
    x="% of WWTP", y="% of total capacity",
    caption = Source
  ) +
  scale_x_log10(
    labels = scales::label_number(drop0trailing = TRUE),
    sec.axis = 
      sec_axis(
        trans=~.*coef, 
        name="nb of WWTP",
        labels = scales::label_number(drop0trailing = TRUE)
        )
    ) + 
  scale_y_continuous(
    sec.axis = 
      sec_axis(
        trans=~.*coef2, 
        name="cumulative capacity \n(millions Population Equivalent)"
        )
    ) + 
    theme(legend.position = "none")

Code

ggplot(temp) +
  geom_point(
    aes(
      x = rank_STEU, y = capacity
      )
    ) + 
  labs(
    title = paste("WWTP capacity vs rank, 2014,", Year_analysis),
    subtitle = "looking for a Zipf law",
    x="Waste Water Treatment Plant \n(ranked by capacity)",
    y=" Waste Water Treatment Plant capacity\n(Population Equivalent)"
    ) +
  scale_x_log10(
    labels = scales::label_number(drop0trailing = TRUE)
    ) + 
  scale_y_log10(
    labels = scales::label_number(drop0trailing = TRUE)
    )

Pollution flows

Code

#function for plots : to be finished
f_graph_reporting_nutrients <- function(pollution_in, pollution_out){
  temp <- N_P_rhone_mediterranee %>%
    select(
      Year, capacity, 
      !!as.symbol(pollution_in), !!as.symbol(pollution_out)
      ) %>%
    mutate(
      nutrient_in = is.na(!!as.symbol(pollution_in))==F,
      nutrient_out = is.na(!!as.symbol(pollution_out))==F
      ) %>%
    gather(
      key=in_out_flow, 
      value = `reported pollution`, 
      nutrient_in, nutrient_out
      ) %>%
    group_by(
      Year, in_out_flow, `reported pollution`
      ) %>%
    summarise(
      `number of WWTP`=n(), 
      `capacity (million PE)` = sum(capacity, na.rm=T)/10^6
      ) %>%
    gather(
      key=n_or_capacity, 
      value = value, 
      `number of WWTP`, `capacity (million PE)`
      ) %>%
    #renaming labels
    mutate(
      in_out_flow = case_when(
        in_out_flow == "nutrient_in" ~ pollution_in,
        in_out_flow == "nutrient_out" ~ pollution_out,
      )
    )

  g <- ggplot(temp) +
    geom_area(aes(Year, value, fill=`reported pollution`)) +
    facet_grid(
      n_or_capacity~in_out_flow, 
      scales="free_y", switch = "y") +
    labs(
      y="", x="",
      title = "Reporting of nutrient inflows (left) and outflows (right)",
      subtitle = "In terms of total capacity (top) and nb of WWTP (bottom)",
      caption = Source
      ) 

  return(g)
}

Code

f_graph_reporting_nutrients("NGL_in", "NGL_out")

Code

f_graph_reporting_nutrients("NTK_in", "NTK_out")

Code

f_graph_reporting_nutrients("Pt_in", "Pt_out")

Code

f_graph_reporting_nutrients("DBO5_in", "DBO5_out")

Code

f_graph_reporting_nutrients("DCO_in", "DCO_out")

Code

f_graph_reporting_nutrients("MES_in", "MES_out")

Code

#changing the graph function to change the subtitle (before data cleaning => after data cleaning)
f_graph_nutrient <- function(dataset, nutrient_in, nutrient_out, label, legend_x, legend_y){
  p <- ggplot(dataset) + 
    #nutrient inflow
    geom_line(
      aes(
        Year, 
        !!as.symbol(nutrient_in), 
        color=nutrient_in
        )
      ) + 
    #nutrient outflow
    geom_line(
      aes(
        Year,
        !!as.symbol(nutrient_out), 
        color = nutrient_out
        )
      ) +
    ylim(0, NA) +
    theme(
      legend.position = c(legend_x, legend_y), 
      legend.title = element_blank()
      ) +
    labs(
      x="", y=paste("kt of", label) , 
      title = paste("Reported", label, "flows in Rhône-Méditerranée WWTPs") ,
      subtitle = "reported, not necessarily actual ; here after data cleaning", 
      caption = Source
      ) 
  return(p)
}

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "NGL_in", "NGL_out", "NGL", 0.7, 0.5)

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "NTK_in", "NTK_out", "NTK", 0.7, 0.5)

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "Pt_in", "Pt_out", "Pt", 0.4, 0.5)

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "DBO5_in", "DBO5_out", "DBO5", 0.4, 0.5)

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "DCO_in", "DCO_out", "DCO", 0.4, 0.5)

Code

f_graph_nutrient(basin_N_P_rhone_mediterranee, "MES_in", "MES_out", "MES", 0.4, 0.5)

We compute in terms of installed capacity the reported and unreported flows for NGL, Pt, DBO5, DCO and MES. We do this for each year and for each capacity category.

Code

#create file of reported 
temp <- N_P_rhone_mediterranee %>%
  select(
    Year, PE_bin, capacity, 
    Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out
    ) %>%
  #spots unreported values for each nutrient flow
  mutate(
    across(
      c(Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out),
      ~ is.na(.x)==F
      )
    ) %>%
  #gather to ba able to then group by flow and count capacity
  gather(
    key=nutrient_flow, 
    value = reported_pollution, 
    Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out
    )  %>%
  #count reported capacity and unreported capacity for each (Year, capacity category, nutrient flow)
  group_by(
    Year, PE_bin, nutrient_flow, reported_pollution
    ) %>%
  summarise(
    capacity = sum(capacity, na.rm=T)/10^6
    ) %>%
  #creates reported/unreported names for each nutrient flow and spreads into columns
  mutate(
    nutrient_flow = case_when(
      reported_pollution == T ~ paste0(nutrient_flow, "_reported"),
      reported_pollution == F ~ paste0(nutrient_flow, "_unreported")
    )
  ) %>%
  select(-reported_pollution) %>%
  spread(nutrient_flow, capacity)

# NA values replaced by 0 for future coeff computation
temp[is.na(temp)] <- 0

From this we compute proportionate coefficient to extrapolate real flows.

Code

temp <- temp %>%
  mutate(
    #Pt
    Pt_in_coeff = (Pt_in_reported + Pt_in_unreported)/Pt_in_reported,
    Pt_out_coeff = (Pt_out_reported + Pt_out_unreported)/Pt_out_reported,
    #NGL
    NGL_in_coeff = (NGL_in_reported + NGL_in_unreported)/NGL_in_reported,
    NGL_out_coeff = (NGL_out_reported + NGL_out_unreported)/NGL_out_reported,
    #DBO5
    DBO5_in_coeff = (DBO5_in_reported + DBO5_in_unreported)/DBO5_in_reported,
    DBO5_out_coeff = (DBO5_out_reported + DBO5_out_unreported)/DBO5_out_reported,
    #DCO
    DCO_in_coeff = (DCO_in_reported + DCO_in_unreported)/DCO_in_reported,
    DCO_out_coeff = (DCO_out_reported + DCO_out_unreported)/DCO_out_reported,
    #MES
    MES_in_coeff = (MES_in_reported + MES_in_unreported)/MES_in_reported,
    MES_out_coeff = (MES_out_reported + MES_out_unreported)/MES_out_reported,
  ) %>%
  select(
    -c(
      Pt_in_reported, Pt_in_unreported,
      Pt_out_reported, Pt_out_unreported,
      NGL_in_reported, NGL_in_unreported,
      NGL_out_reported, NGL_out_unreported,
      DBO5_in_reported, DBO5_in_unreported,
      DBO5_out_reported, DBO5_out_unreported,
      DCO_in_reported, DCO_in_unreported, 
      DCO_out_reported, DCO_out_unreported, 
      MES_in_reported, MES_in_unreported, 
      MES_out_reported, MES_out_unreported
      )
    )

#replace Inf values by 1 (when divided by 0)
temp[temp == Inf] <- 1
#replace NAN values by 1 (case of the unerported capacities)
temp[is.na(temp)] <- 1

With these coefficients we compute the adjusted flows

Code

#file with reported flows and adjustment coefficient
temp2 <- left_join(
  basin_PE_N_P_rhone_mediterranee %>%
    #selects only flows and not yields or ratios
    select(
      Year, PE_bin, 
      Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out
    ),
  temp, by=c("Year", "PE_bin")
)

#computes adjusted flows
temp2 <- temp2 %>%
  mutate(
    #Pt
    Pt_in_adj = round(Pt_in_coeff*Pt_in, 5),
    Pt_out_adj = round(Pt_out_coeff*Pt_out, 5),
    #NGL
    NGL_in_adj = round(NGL_in_coeff*NGL_in, 5),
    NGL_out_adj = round(NGL_out_coeff*NGL_out, 5),
    #DBO5
    DBO5_in_adj = round(DBO5_in_coeff*DBO5_in, 5),
    DBO5_out_adj = round(DBO5_out_coeff*DBO5_out, 5),
    #DCO
    DCO_in_adj = round(DCO_in_coeff*DCO_in, 5),
    DCO_out_adj = round(DCO_out_coeff*DCO_out, 5),
    #MES
    MES_in_adj = round(MES_in_coeff*MES_in, 5),
    MES_out_adj = round(MES_out_coeff*MES_out, 5)
  ) %>%
  #we remove coefficients and unajusted flows
  select(
    -c(
      Pt_in, Pt_in_coeff, Pt_out, Pt_out_coeff,
      NGL_in, NGL_in_coeff, NGL_out, NGL_out_coeff,
      DBO5_in, DBO5_in_coeff, DBO5_out, DBO5_out_coeff,
      DCO_in, DCO_in_coeff, DCO_out, DCO_out_coeff, 
      MES_in, MES_in_coeff, MES_out, MES_out_coeff
    )
  )

We add these adjusted flows to the main files reporting flows at the basin scale

Code

#adding adjusted flows to the basin x capacity files
basin_PE_N_P_rhone_mediterranee <- left_join(
  basin_PE_N_P_rhone_mediterranee, temp2, by=c("Year", "PE_bin")
)

#aggregating adjusted flows at the basin scale without the capacity categories
temp <- temp2 %>%
  select(-PE_bin) %>%
  group_by(Year) %>%
  summarise_all(~signif(sum(.x), 3))

#adding adjusted flows to the basin files
basin_N_P_rhone_mediterranee <- left_join(
  basin_N_P_rhone_mediterranee, temp, by="Year"
)

We plot the comparison reported / adjusted in the following graphs.

Code

f_graph_adjusted <- function(basin_file, basin_PE_file, nutrient_adjusted, nutrient_reported, nutrient_label){
  g <- plot_grid(
    ggplot(basin_PE_file) +
      geom_line(
        data = basin_file,
        aes(Year, !!as.symbol(nutrient_adjusted)), 
        color="black", size=1
        ) + 
      geom_area(
        aes(Year, !!as.symbol(nutrient_reported), fill=PE_bin), 
        alpha=.7
        ) + 
      theme(legend.position = "none") +
      labs(
        x="", y="kt per year",
        caption = "\n",
        title = paste("Adjusted", nutrient_label, "flows")
    ),
    ggplot(basin_PE_file) +
      geom_line(
        aes(Year, !!as.symbol(nutrient_adjusted), color=PE_bin), 
        size=1
        ) + 
      geom_area(
        aes(Year, !!as.symbol(nutrient_reported), fill=PE_bin), 
        alpha=.7
        ) + 
      theme(legend.position = "none") +
      facet_wrap(vars(PE_bin), scales="free") +
      labs(
        x="", y="",
        caption = Source,
        title = "",
        subtitle = "line: adjusted flow ; area: reported flow"
      ),
    rel_widths = c(0.3, 0.7)
  )
  return(g)
}

Code

f_graph_adjusted(
  basin_N_P_rhone_mediterranee, 
  basin_PE_N_P_rhone_mediterranee,
  "Pt_in_adj", "Pt_in", "incoming Pt"
  )

Code

f_graph_adjusted(
  basin_N_P_rhone_mediterranee, 
  basin_PE_N_P_rhone_mediterranee,
  "Pt_out_adj", "Pt_out", "discharged Pt"
  )

Code

f_graph_adjusted(
  basin_N_P_rhone_mediterranee, 
  basin_PE_N_P_rhone_mediterranee,
  "NGL_in_adj", "NGL_in", "incoming NGL"
  )

Code

f_graph_adjusted(
  basin_N_P_rhone_mediterranee, 
  basin_PE_N_P_rhone_mediterranee,
  "NGL_out_adj", "NGL_out", "discharged NGL"
  )

Code

f_graph_adjusted(
  basin_N_P_rhone_mediterranee, 
  basin_PE_N_P_rhone_mediterranee,
  "DBO5_in_adj", "DBO5_in", "incoming DBO5"
  )

Code

f_graph_adjusted(
  basin_N_P_rhone_mediterranee, 
  basin_PE_N_P_rhone_mediterranee,
  "DBO5_out_adj", "DBO5_out", "discharged DBO5"
  )

Code

f_graph_adjusted(
  basin_N_P_rhone_mediterranee, 
  basin_PE_N_P_rhone_mediterranee,
  "DCO_in_adj", "DCO_in", "incoming DCO"
  )

Code

f_graph_adjusted(
  basin_N_P_rhone_mediterranee, 
  basin_PE_N_P_rhone_mediterranee,
  "DCO_out_adj", "DCO_out", "discharged DCO"
  )

Code

f_graph_adjusted(
  basin_N_P_rhone_mediterranee, 
  basin_PE_N_P_rhone_mediterranee,
  "MES_in_adj", "MES_in", "incoming MES"
  )

Code

f_graph_adjusted(
  basin_N_P_rhone_mediterranee, 
  basin_PE_N_P_rhone_mediterranee,
  "MES_out_adj", "MES_out", "discharged MES"
  )

Ratios

Basin scale

Code

#temporal P/N ratio
ggplot(basin_N_P_rhone_mediterranee) + 
  geom_line(aes(Year, N_P_ratio_in, color="N:P in")) + 
  geom_line(aes(Year, N_P_ratio_out, color = "N:P out")) + 
  ylim(0, NA) +
  theme(
    legend.position = c(0.7, 0.6)
  ) +
  labs(
    x="", y="N:P ratio",
    title = "N:P ratio in Rhône-Méditerranéee basin",
    subtitle = "increase over time reflect phosphate detergent ban",
    caption=Source, color=""
  )

Code

ggplot(basin_N_P_rhone_mediterranee) + 
  geom_line(aes(Year, DCO_in/DBO5_in, color="DCO:DBO5 in")) + 
  geom_line(aes(Year, DCO_out/DBO5_out, color = "DCO:DBO5 out")) + 
  ylim(0, NA) +
  theme(
    legend.position = c(0.7, 0.6)
  ) +
  labs(
    x="", y="DCO:DBO5 ratio",
    title = "DCO:DBO5 ratio in Rhône-Méditerranéee basin",
    subtitle = "decrease in outflow shows biodegradation",
    caption=Source, color=""
  )

Code

ggplot(basin_N_P_rhone_mediterranee) + 
  geom_point(
    aes(
      DBO5_in/NGL_in, DBO5_in/Pt_in, 
      color=Year_category
      )
    ) +
  geom_point(
    aes(
      DBO5_out/NGL_out, DBO5_out/Pt_out, 
      color=Year_category
        )
    ) +
  ylim(0, NA) +
  annotate(
    geom="text", label ="inflow",
    x=4, y=25
  ) +
  annotate(
    geom="text", label ="outflow",
    x=1.3, y=8
  ) +
  labs(
    x="DBO5:Pt ratio", y="DBO5:NGL ratio",
    title = "DBO5:NGL vs DBO5:Pt ratio in Rhône-Méditerranéee basin WWTPs",
    subtitle = "decrease in outflow shows biodegradation",
    caption=Source, color=""
  )

Basin yield

Basin
Basin x Capacity

Code

ggplot(basin_N_P_rhone_mediterranee) + 
  geom_line(aes(Year, Pt_yield, color="P")) + 
  geom_line(aes(Year, NGL_yield, color = "N")) + 
  geom_line(aes(Year, DBO5_yield, color = "DBO5")) +
  geom_line(aes(Year, DCO_yield, color = "DCO")) +
  geom_line(aes(Year, MES_yield, color = "MES")) +
  ylim(0,100) +
  theme(legend.position = c(0.7, 0.3)) +
  labs(
    title = "Global abatement rate of Rhône-Méditerranée WWTPs", 
    x="", y="Yield (%)", color="",
    caption = Source
    )

Code

f_graph_yield_PE <- function(dataset, nutrient_yield, nutrient_label){
  g <- ggplot(dataset %>% filter(PE_bin !="unreported PE")) + 
    geom_line(aes(Year, !!as.symbol(nutrient_yield), color=PE_bin)) + 
    ylim(0,100) +
    theme(legend.position = c(0.7, 0.3)) +
    labs(
      title = paste("Global", nutrient_label, "yield of Rhône-Méditerranée basin"), 
      subtitle = "by capacity (population equivalent)",
      x="", y="Yield (%)", color="",
      caption = Source
      )
  return(g)
}

Code

f_graph_yield_PE(
  basin_PE_N_P_rhone_mediterranee %>% filter(PE_bin!="0 - 200 PE"), 
  "Pt_yield", "Pt")

Code

f_graph_yield_PE(
  basin_PE_N_P_rhone_mediterranee %>% filter(PE_bin!="0 - 200 PE"), 
  "NGL_yield", "NGL")

Code

f_graph_yield_PE(
  basin_PE_N_P_rhone_mediterranee %>% filter(PE_bin!="0 - 200 PE"), 
  "DBO5_yield", "DBO5")

Code

f_graph_yield_PE(
  basin_PE_N_P_rhone_mediterranee %>% filter(PE_bin!="0 - 200 PE"), 
  "DCO_yield", "DCO")

Code

f_graph_yield_PE(
  basin_PE_N_P_rhone_mediterranee %>% filter(PE_bin!="0 - 200 PE"), 
  "MES_yield", "MES")

Save data

We save the data. We consider all the absolute flows to be unreliable before 2009, and so we do not save them. We also remove the ratios nutrient_flow:capacities.

Before 2009, we also all data that include outgoing DBO5 to be unreliable. This includes:

outgoing DBO5:Pt ratio
DBO5 yield
(not need for DBO5:DCO and DBO5:NGL since outgoing NGL and DCO are not reported before 2009)

Code

#for N_P_rhone_mediterranee file
temp <- N_P_rhone_mediterranee %>%
  filter(Year<2009) %>%
  mutate(
    across(
      c(
        DBO5_in, DCO_in, MES_in, NGL_in, NTK_in, Pt_in,
        DBO5_out, DCO_out, MES_out, NGL_out, NTK_out, Pt_out,
        DBO5_yield
      ),~NA
    )
  )
N_P_rhone_mediterranee <- bind_rows(
  temp,
  N_P_rhone_mediterranee %>% filter(Year >= 2009)
)

#for basin_N_P_rhone_mediterranee file
temp <- basin_N_P_rhone_mediterranee %>%
  filter(Year<2009) %>%
  mutate(
    across(
      c(
        #in and out flows (adjusted and unadjusted)
        Pt_in, Pt_in_adj, NGL_in, NGL_in_adj, NTK_in,
        DBO5_in, DBO5_in_adj, DCO_in, DCO_in_adj, MES_in, MES_in_adj,
        Pt_out, Pt_out_adj, NGL_out, NGL_out_adj, NTK_out,
        DBO5_out, DBO5_out_adj, DCO_out, DCO_out_adj, MES_out, MES_out_adj,
        #PE ratios
        Pt_PE_ratio_in, Pt_PE_ratio_out, NGL_PE_ratio_in, NGL_PE_ratio_out, 
        DBO5_PE_ratio_in, DBO5_PE_ratio_out, DCO_PE_ratio_in, DCO_PE_ratio_out, MES_PE_ratio_in, MES_PE_ratio_out,
        #flows with outgoing DBO5
        DBO5_P_ratio_out, DBO5_yield
      ),~NA
    )
  )
basin_N_P_rhone_mediterranee <- bind_rows(
  temp,
  basin_N_P_rhone_mediterranee %>% filter(Year >= 2009)
)

#for basin_PE_N_P_rhone_mediterranee file
temp <- basin_PE_N_P_rhone_mediterranee %>%
  filter(Year<2009) %>%
  mutate(
    across(
      c(
        #in and out flows (adjusted and unadjusted)
        Pt_in, Pt_in_adj, NGL_in, NGL_in_adj, NTK_in,
        DBO5_in, DBO5_in_adj, DCO_in, DCO_in_adj, MES_in, MES_in_adj,
        Pt_out, Pt_out_adj, NGL_out, NGL_out_adj, NTK_out,
        DBO5_out, DBO5_out_adj, DCO_out, DCO_out_adj, MES_out, MES_out_adj,
        #PE ratios
        Pt_PE_ratio_in, Pt_PE_ratio_out, NGL_PE_ratio_in, NGL_PE_ratio_out, 
        DBO5_PE_ratio_in, DBO5_PE_ratio_out, DCO_PE_ratio_in, DCO_PE_ratio_out, MES_PE_ratio_in, MES_PE_ratio_out,
        #nutrient ratio
        DBO5_P_ratio_out, DBO5_yield
      ),~NA
    )
  )
basin_PE_N_P_rhone_mediterranee <- bind_rows(
  temp,
  basin_PE_N_P_rhone_mediterranee %>% filter(Year >= 2009)
)

Code

#all WWTP file
path_output <- "output_data/all_WWTP/"
temp <- N_P_rhone_mediterranee %>%
  select(
    code_WWTP, name_WWTP, Year, capacity, name_commune, INSEE_COM, lat_WWTP, long_WWTP, PE_bin,
    DBO5_in, DCO_in, MES_in, NGL_in, NTK_in, Pt_in,
    DBO5_out, DCO_out, MES_out, NGL_out, NTK_out, Pt_out,
    DBO5_yield, DCO_yield, MES_yield, NGL_yield, Pt_yield,
  )
f_save_csv_files(
  temp,
  path_output,
  "all_WWTP_06_rhone_mediterranee.csv"
)

#basin agregated file
path_output <- "output_data/basins/"
temp <- basin_N_P_rhone_mediterranee %>%
  select(
    Year, capacity, nb_WWTP,
    #flows reported and adjusted
    Pt_in, Pt_in_adj, NGL_in, NGL_in_adj, 
    DBO5_in, DBO5_in_adj, DCO_in, DCO_in_adj, MES_in, MES_in_adj,
    Pt_out, Pt_out_adj, NGL_out, NGL_out_adj,
    DBO5_out, DBO5_out_adj, DCO_out, DCO_out_adj, MES_out, MES_out_adj,
    #yields
    NGL_yield, Pt_yield, DBO5_yield, DCO_yield, MES_yield, 
    #nutrient ratios
    N_P_ratio_in, N_P_ratio_out,
    DBO5_N_ratio_in, DBO5_N_ratio_out, DBO5_P_ratio_in, DBO5_P_ratio_out,
    DCO_N_ratio_in, DCO_N_ratio_out, DCO_P_ratio_in, DCO_P_ratio_out,
    DCO_DBO5_ratio_in, DCO_DBO5_ratio_out,
    #PE ratios
    Pt_PE_ratio_in, Pt_PE_ratio_out, NGL_PE_ratio_in, NGL_PE_ratio_out, 
    DBO5_PE_ratio_in, DBO5_PE_ratio_out, DCO_PE_ratio_in, DCO_PE_ratio_out, MES_PE_ratio_in, MES_PE_ratio_out
  )
f_save_csv_files(
  temp,
  path_output,
  "basin_06_rhone_mediterranee.csv"
)

#basin x PE agregated file
path_output <- "output_data/basins_PE/"
temp <- basin_PE_N_P_rhone_mediterranee %>%
  select(
    Year, capacity, nb_WWTP, PE_bin, 
    #flows reported and adjusted
    Pt_in, Pt_in_adj, NGL_in, NGL_in_adj, 
    DBO5_in, DBO5_in_adj, DCO_in, DCO_in_adj, MES_in, MES_in_adj,
    Pt_out, Pt_out_adj, NGL_out, NGL_out_adj,
    DBO5_out, DBO5_out_adj, DCO_out, DCO_out_adj, MES_out, MES_out_adj,
    #yields
    NGL_yield, Pt_yield, DBO5_yield, DCO_yield, MES_yield, 
    #nutrient ratios
    N_P_ratio_in, N_P_ratio_out,
    DBO5_N_ratio_in, DBO5_N_ratio_out, DBO5_P_ratio_in, DBO5_P_ratio_out,
    DCO_N_ratio_in, DCO_N_ratio_out, DCO_P_ratio_in, DCO_P_ratio_out,
    DCO_DBO5_ratio_in, DCO_DBO5_ratio_out,
    #PE ratios
    Pt_PE_ratio_in, Pt_PE_ratio_out, NGL_PE_ratio_in, NGL_PE_ratio_out, 
    DBO5_PE_ratio_in, DBO5_PE_ratio_out, DCO_PE_ratio_in, DCO_PE_ratio_out, MES_PE_ratio_in, MES_PE_ratio_out
  )
f_save_csv_files(
  temp,
  path_output,
  "basin_PE_06_rhone_mediterranee.csv"
)

Code

rm(list = ls())