Artois-Picardie

CAREFUL: for now, for Artois-Picardie basin, we do not have NGL outflow after 2008. We extrapolate at the basin scale from NO out and a constant ratio. We will update with better data in the future.

Code
knitr::opts_chunk$set(warning=F, message=F, results=F, dev='svg')

library(tidyverse) #loads multiple packages (see https://tidyverse.tidyverse.org/)

#core tidyverse packages loaded:
# ggplot2, for data visualisation. https://ggplot2.tidyverse.org/
# dplyr, for data manipulation. https://dplyr.tidyverse.org/
# tidyr, for data tidying. https://tidyr.tidyverse.org/
# readr, for data import. https://readr.tidyverse.org/
# purrr, for functional programming. https://purrr.tidyverse.org/
# tibble, for tibbles, a modern re-imagining of data frames. https://tibble.tidyverse.org/
# stringr, for strings. https://stringr.tidyverse.org/
# forcats, for factors. https://forcats.tidyverse.org/
# lubridate, for date/times. https://lubridate.tidyverse.org/

#also loads the following packages (less frequently used):
# Working with specific types of vectors:
#     hms, for times. https://hms.tidyverse.org/
# Importing other types of data:
#     feather, for sharing with Python and other languages. https://github.com/wesm/feather
#     haven, for SPSS, SAS and Stata files. https://haven.tidyverse.org/
#     httr, for web apis. https://httr.r-lib.org/
#     jsonlite for JSON. https://arxiv.org/abs/1403.2805
#     readxl, for .xls and .xlsx files. https://readxl.tidyverse.org/
#     rvest, for web scraping. https://rvest.tidyverse.org/
#     xml2, for XML. https://xml2.r-lib.org/
# Modelling
#     modelr, for modelling within a pipeline. https://modelr.tidyverse.org/
#     broom, for turning models into tidy data. https://broom.tidymodels.org/

# Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


#loading additional relevant packages
library(cowplot) #for plot_grid()

#setting graphs theme
theme_set(
  theme_minimal() +
    theme(plot.title = element_text(face="bold"))
  )

#setting viridis theme for colors
scale_colour_continuous <- scale_colour_viridis_c
scale_colour_discrete   <- scale_colour_viridis_d
scale_colour_binned     <- scale_colour_viridis_b
#setting viridis theme for fill
scale_fill_continuous <- scale_fill_viridis_c
scale_fill_discrete   <- scale_fill_viridis_d
scale_fill_binned     <- scale_fill_viridis_b

Source <- "Agence de l'Eau Artois-Picardie\nComputation by Thomas Starck"

path_source <- "source_data/01_artois_picardie/"

Year_analysis <- 2018

# Load the function file
source("functions.R")

Data quality and description

Data source can be downloaded on Artois-Picardie Water Agency website, here (in Données sur les usages de l’eau -> Assainissement). This gives the data for 1992-2018

In the excel file, the table STATIONS D’EPURATION URBAINE describes the waste water treatment plant characteristics: capacity in population equivalent, N and P treatment, treatment type, nominal NK and P inflow….

The table PERFORMANCES also reports the plant capacity and adds incoming and outgoing reduced and oxidized nitrogen (NR and NO) flows and phosphorus (P) flows. It also reports DBO5, DCO and MES. Starting in 2008, NO is not reported anymore. DBO5 and DCO are only reported starting 2008.

Information about the basin ca be found in the “Etat des lieux 2019” (status report).

There are 4.7 million inhabitants in the basin. 720 000 inhabitants are not connected to sewers and use Individual Autonomous Systems (IAS).

Information about the basin can also be found in the “Guide de l’eau” (water guide), here and here.

We load the PERFORMANCES and STATIONS D’EPURATION URBAINE sheets and merge them. We compute the yields for each WWTP.

Code
#main file
file_artois_picardie <- readxl::read_excel(paste(path_source, "stations_depuration_urbaine.xls", sep=""), sheet = "PERFORMANCES")

#select and rename columns of interest
N_P_artois_picardie <- file_artois_picardie %>%
  select(
    code_WWTP = `Code SANDRE station`,
    name_WWTP = `Nom de la station d'épuration urbaine`,
    Year = `Année d'activité`,
    capacity = `Capacité de la station d'épuration urbaine (en Eh)`,
    #MES
    MES_in_reported = `Assiette entrante en MeS (en kg/j)`,
    MES_out = `Assiette sortante en MeS (en kg/j)`,
    MES_removed = `Assiette enlevée en MeS (en kg/j)`,
    #DBO5
    DBO5_in_reported = `Assiette entrante en DBO5 (en kg/j)`,
    DBO5_out = `Assiette sortante en DBO5 (en kg/j)`,
    DBO5_removed = `Assiette enlevée en DBO5 (en kg/j)`,
    #DCO
    DCO_in_reported = `Assiette entrante en DCO (en kg/j)`,
    DCO_out = `Assiette sortante en DCO (en kg/j)`,
    DCO_removed = `Assiette enlevée en DCO (en kg/j)`,
    #Pt
    Pt_in_reported = `Assiette entrante en P (en kg/j)`,
    Pt_out = `Assiette sortante en P (en kg/j)`,
    Pt_removed = `Assiette enlevée en P (en kg/j)`,
    #NTK
    NTK_in_reported = `Assiette entrante en NR (en kg/j)`,
    NTK_out = `Assiette sortante en NR (en kg/j)`,
    NTK_removed = `Assiette enlevée en NR (en kg/j)`,
    #NO
    NO_in_reported = `Assiette entrante en NO (en kg/j)`,
    NO_out = `Assiette sortante en NO (en kg/j)`,
    NO_removed = `Assiette enlevée en NO (en kg/j)`
  )

# Some reported flows are negative or null. We replace them with empty values.
#Pt
N_P_artois_picardie$Pt_in_reported[N_P_artois_picardie$Pt_in_reported <= 0] <- NA
N_P_artois_picardie$Pt_out[N_P_artois_picardie$Pt_out <= 0] <- NA
N_P_artois_picardie$Pt_removed[N_P_artois_picardie$Pt_removed <= 0] <- NA
#DBO5
N_P_artois_picardie$DBO5_in_reported[N_P_artois_picardie$DBO5_in_reported <= 0] <- NA
N_P_artois_picardie$DBO5_out[N_P_artois_picardie$DBO5_out <= 0] <- NA
N_P_artois_picardie$DBO5_removed[N_P_artois_picardie$DBO5_removed <= 0] <- NA
#DCO
N_P_artois_picardie$DCO_in_reported[N_P_artois_picardie$DCO_in_reported <= 0] <- NA
N_P_artois_picardie$DCO_out[N_P_artois_picardie$DCO_out <= 0] <- NA
N_P_artois_picardie$DCO_removed[N_P_artois_picardie$DCO_removed <= 0] <- NA
#MES
N_P_artois_picardie$MES_in_reported[N_P_artois_picardie$MES_in_reported <= 0] <- NA
N_P_artois_picardie$MES_out[N_P_artois_picardie$MES_out <= 0] <- NA
N_P_artois_picardie$MES_removed[N_P_artois_picardie$MES_removed <= 0] <- NA
#NTK
N_P_artois_picardie$NTK_in_reported[N_P_artois_picardie$NTK_in_reported <= 0] <- NA
N_P_artois_picardie$NTK_out[N_P_artois_picardie$NTK_out <= 0] <- NA
N_P_artois_picardie$NTK_removed[N_P_artois_picardie$NTK_removed <= 0] <- NA
#NO
N_P_artois_picardie$NO_in_reported[N_P_artois_picardie$NO_in_reported <= 0] <- NA
N_P_artois_picardie$NO_out[N_P_artois_picardie$NO_out <= 0] <- NA
#for NO we cannot apply it to removed flow car NO can be created through nitrification, thus a <0 "removed" flow is possible

#estimation of NGL ; incoming nutrient flows ; nutrient yields
N_P_artois_picardie <- N_P_artois_picardie %>%
  rowwise() %>% #rowise to be able to remove empty values
  mutate(
    #NGL
    NGL_in_reported = NTK_in_reported, # we approximate NGL_in by NTK_in
    #we compute NGL outflow only if both NTK and NO are reported
    NGL_out = sum(NO_out + NTK_out, na.rm=!((is.na(NTK_out)|is.na(NO_out)))),
    #computed inflow (not possible to compute NO_in because NO output "created" during treatment)
    Pt_in = Pt_out + Pt_removed,
    NTK_in = NTK_out + NTK_removed,
    NGL_in = NTK_in,
    DCO_in = DCO_out + DCO_removed,
    DBO5_in = DBO5_out + DBO5_removed,
    MES_in = MES_out + MES_removed,
    #WWTP yield
    Pt_yield = (1-Pt_out/Pt_in)*100,
    NGL_yield = (1-NGL_out/NGL_in)*100,
    DCO_yield = (1-DCO_out/DCO_in)*100,
    DBO5_yield = (1-DBO5_out/DBO5_in)*100,
    MES_yield = (1-MES_out/MES_in)*100
  )
  
#Careful : starting 2008, NO is not reported anymore

#Loading the infos related to WWTP characteristics
file_WWTP <- readxl::read_excel(paste(path_source, "stations_depuration_urbaine.xls", sep=""), sheet = "STATIONS D'EPURATION URBAINE")
WWTP <- file_WWTP %>% 
  select(
    code_WWTP = `Code SANDRE station`,
    name_commune = `Nom commune`,
    treatment_type = `Type de station`,
    N_treatment = `Traitement azote ?`,
    P_treatment = `Traitement phosphore ?`,
    capacity_bis = `Capacité STEU (en EH)`,
    NTK_nominal_flow = `Flux nominal journalier en NK (en kg/j)`,
    P_nominal_flow = `Flux nominal journalier en P (en kg/j)`,
    INSEE_COM = `Code INSEE commune`,
    lat_WWTP = `Lambert93 X`,
    long_WWTP = `Lambert93 Y`
  )

#merging the 2 files
N_P_artois_picardie <- left_join(
  N_P_artois_picardie, WWTP, by = "code_WWTP"
)

#1991 is poorly reported, we do not keep it in the analysis
N_P_artois_picardie <- N_P_artois_picardie %>%
  filter(Year>1991)

#no double reporting (check that the file is empty) => ok
N_P_artois_picardie %>%
  select(Year, code_WWTP) %>%
  count(Year, code_WWTP) %>%
  filter(n !=1)

# check there is no unreported capacity : OK (returns empty table)
N_P_artois_picardie %>% filter(is.na(capacity)|capacity<=0)

We create the capacity categories in terms of population equivalent.

Code
#function to create capacity categories
f_PE_bin_categories <- function(dataset){
  #Creating size categories
  dataset <- 
    dataset %>%
    mutate(
      PE_bin = case_when(
        capacity < 200 ~ "0 - 200 PE",
        (capacity >= 200) & (capacity <2000) ~ "200 - 2 000 PE",
        (capacity >= 2000) & (capacity <10000) ~ "2 000 - 10 000 PE",
        (capacity >= 10000) & (capacity <100000) ~ "10 000 - 100 000 PE",
        capacity >= 100000 ~ "> 100 000 PE"
      )
    )
  
  #reorders treatment by their share of total capacity
  dataset$PE_bin <- 
    factor(
      dataset$PE_bin, 
      levels = 
        c("0 - 200 PE", 
          "200 - 2 000 PE", 
          "2 000 - 10 000 PE",
          "10 000 - 100 000 PE", 
          "> 100 000 PE"
          )
        )
  return(dataset)
}
N_P_artois_picardie <- f_PE_bin_categories(N_P_artois_picardie)

We create the file with aggregated flows at the basin scale, and also by capacity categories.

Code
f_basin_flows <- function(dataset){
  basin <- dataset %>%
    group_by(Year) %>%
    summarise(
      across(
        c(
          NGL_in, NTK_in, Pt_in, DBO5_in, DCO_in, MES_in,
          NGL_out, NTK_out, Pt_out, DBO5_out, DCO_out, MES_out, NO_out,
          NGL_in_reported, NTK_in_reported, Pt_in_reported, DBO5_in_reported, DCO_in_reported, MES_in_reported, NO_in_reported,
          NTK_removed, Pt_removed, DBO5_removed, DCO_removed, MES_removed, NO_removed
        ),
        ~signif(sum(.x, na.rm = T)*365/10^6, digits=3)
      ),
    #nb of waste water treatment plant
    nb_WWTP = n(),
    #capacity converted in million Population Equivalent
    capacity = signif(sum(capacity, na.rm = T)/10^6, digits=3),
    )
  return(basin)
}
basin_N_P_artois_picardie <- f_basin_flows(N_P_artois_picardie)

f_basin_PE_flows <- function(dataset){
  basin <- dataset %>%
    group_by(Year, PE_bin) %>%
    summarise(
      across(
        c(
          NGL_in, NTK_in, Pt_in, DBO5_in, DCO_in, MES_in,
          NGL_out, NTK_out, Pt_out, DBO5_out, DCO_out, MES_out, NO_out,
          NGL_in_reported, NTK_in_reported, Pt_in_reported, DBO5_in_reported, DCO_in_reported, MES_in_reported, NO_in_reported,
          NTK_removed, Pt_removed, DBO5_removed, DCO_removed, MES_removed, NO_removed
        ),
        ~signif(sum(.x, na.rm = T)*365/10^6, digits=3)
      ),
    #nb of waste water treatment plant
    nb_WWTP = n(),
    #capacity converted in million Population Equivalent
    capacity = signif(sum(capacity, na.rm = T)/10^6, digits=3),
    )
  return(basin)
}
basin_PE_N_P_artois_picardie <- f_basin_PE_flows(N_P_artois_picardie)

We compute the nutrient ratios at the basin scale, and also by capacity categories.

Code
#have to do this in case 1 nutrient is more reported than the other, which would create a bia if we took the ratio of the already aggregated flows
f_nutrient_ratio_basin <- function(basin, dataset, nutrient1, nutrient2){
  temp <- dataset %>% 
    filter(
      is.na(!!as.symbol(nutrient1))==F & is.na(!!as.symbol(nutrient2))==F
      ) %>%
    group_by(Year) %>%
    summarise(
      nutrient_1 = sum(!!as.symbol(nutrient1), na.rm=T),
      nutrient_2 = sum(!!as.symbol(nutrient2), na.rm=T),
      ratio = signif(nutrient_1/nutrient_2, digits=3)
    ) %>%
    select(-nutrient_1, -nutrient_2)
  basin <- left_join(
    basin, temp, by="Year"
  )
  return(basin)
}

f_nutrient_ratio_basin_PE <- function(basin_PE, dataset, nutrient1, nutrient2){
  temp <- dataset %>% 
    filter(
      is.na(!!as.symbol(nutrient1))==F & is.na(!!as.symbol(nutrient2))==F
      ) %>%
    group_by(Year, PE_bin) %>%
    summarise(
      nutrient_1 = sum(!!as.symbol(nutrient1), na.rm=T),
      nutrient_2 = sum(!!as.symbol(nutrient2), na.rm=T),
      ratio = signif(nutrient_1/nutrient_2, digits=3)
    ) %>%
    select(-nutrient_1, -nutrient_2)
  basin_PE <- left_join(
    basin_PE, temp, by=c("Year", "PE_bin")
  )
  return(basin_PE)
}

f_all_nutrient_ratios_basin <- function(basin, dataset){
  basin <- basin %>%
    
    #nutrient ratios
    #N_P in
    f_nutrient_ratio_basin(dataset, "NGL_in", "Pt_in") %>%
    rename(N_P_ratio_in = ratio) %>%
    #N_P out
    f_nutrient_ratio_basin(dataset, "NGL_out", "Pt_out") %>%
    rename(N_P_ratio_out = ratio) %>%
    #DCO_DBO5 in
    f_nutrient_ratio_basin(dataset, "DCO_in", "DBO5_in") %>%
    rename(DCO_DBO5_ratio_in = ratio) %>%
    #DCO_DBO5 out
    f_nutrient_ratio_basin(dataset, "DCO_out", "DBO5_out") %>%
    rename(DCO_DBO5_ratio_out = ratio) %>%
    #DBO5_N in
    f_nutrient_ratio_basin(dataset, "DBO5_in", "NGL_in") %>%
    rename(DBO5_N_ratio_in = ratio) %>%
    #DBO5_N out
    f_nutrient_ratio_basin(dataset, "DBO5_out", "NGL_out") %>%
    rename(DBO5_N_ratio_out = ratio) %>%
    #DBO5_P in
    f_nutrient_ratio_basin(dataset, "DBO5_in", "Pt_in") %>%
    rename(DBO5_P_ratio_in = ratio) %>%
    #DBO5_P out
    f_nutrient_ratio_basin(dataset, "DBO5_out", "Pt_out") %>%
    rename(DBO5_P_ratio_out = ratio) %>%
    #DCO_N in
    f_nutrient_ratio_basin(dataset, "DCO_in", "NGL_in") %>%
    rename(DCO_N_ratio_in = ratio) %>%
    #DCO_N out
    f_nutrient_ratio_basin(dataset, "DCO_out", "NGL_out") %>%
    rename(DCO_N_ratio_out = ratio) %>%
    #DCO_P in
    f_nutrient_ratio_basin(dataset, "DCO_in", "Pt_in") %>%
    rename(DCO_P_ratio_in = ratio) %>%
    #DCO_P out
    f_nutrient_ratio_basin(dataset, "DCO_out", "Pt_out") %>%
    rename(DCO_P_ratio_out = ratio) %>%
    
    #capacity ratios
    #Pt_PE_in
    f_nutrient_ratio_basin(dataset, "Pt_in", "capacity") %>%
    rename(Pt_PE_ratio_in = ratio) %>%
    #Pt_PE_out
    f_nutrient_ratio_basin(dataset, "Pt_out", "capacity") %>%
    rename(Pt_PE_ratio_out = ratio) %>%
    #NGL_PE_in
    f_nutrient_ratio_basin(dataset, "NGL_in", "capacity") %>%
    rename(NGL_PE_ratio_in = ratio) %>%
    #NGL_PE_out
    f_nutrient_ratio_basin(dataset, "NGL_out", "capacity") %>%
    rename(NGL_PE_ratio_out = ratio) %>%
    #DBO5_PE_in
    f_nutrient_ratio_basin(dataset, "DBO5_in", "capacity") %>%
    rename(DBO5_PE_ratio_in = ratio) %>%
    #DBO5_PE_out
    f_nutrient_ratio_basin(dataset, "DBO5_out", "capacity") %>%
    rename(DBO5_PE_ratio_out = ratio) %>%
    #DCO_PE_in
    f_nutrient_ratio_basin(dataset, "DCO_in", "capacity") %>%
    rename(DCO_PE_ratio_in = ratio) %>%
    #DCO_PE_out
    f_nutrient_ratio_basin(dataset, "DCO_out", "capacity") %>%
    rename(DCO_PE_ratio_out = ratio) %>%
    #MES_PE_in
    f_nutrient_ratio_basin(dataset, "MES_in", "capacity") %>%
    rename(MES_PE_ratio_in = ratio) %>%
    #MES_PE_out
    f_nutrient_ratio_basin(dataset, "MES_out", "capacity") %>%
    rename(MES_PE_ratio_out = ratio) %>%
    
    #convert from kg per PE per day to g per PE per day
    mutate(
      across(
        c(
          Pt_PE_ratio_in, Pt_PE_ratio_out, NGL_PE_ratio_in, NGL_PE_ratio_out, DBO5_PE_ratio_in, DBO5_PE_ratio_out,
          DCO_PE_ratio_in, DCO_PE_ratio_out, MES_PE_ratio_in, MES_PE_ratio_out
        ), ~.x*1000 
      )
    )
  return(basin)
}

f_all_nutrient_ratios_basin_PE <- function(basin_PE, dataset){
  basin_PE <- basin_PE %>%
    
    #nutrient ratios
    #N_P in
    f_nutrient_ratio_basin_PE(dataset, "NGL_in", "Pt_in") %>%
    rename(N_P_ratio_in = ratio) %>%
    #N_P out
    f_nutrient_ratio_basin_PE(dataset, "NGL_out", "Pt_out") %>%
    rename(N_P_ratio_out = ratio) %>%
    #DCO_DBO5 in
    f_nutrient_ratio_basin_PE(dataset, "DCO_in", "DBO5_in") %>%
    rename(DCO_DBO5_ratio_in = ratio) %>%
    #DCO_DBO5 out
    f_nutrient_ratio_basin_PE(dataset, "DCO_out", "DBO5_out") %>%
    rename(DCO_DBO5_ratio_out = ratio) %>%
    #DBO5_N in
    f_nutrient_ratio_basin_PE(dataset, "DBO5_in", "NGL_in") %>%
    rename(DBO5_N_ratio_in = ratio) %>%
    #DBO5_N out
    f_nutrient_ratio_basin_PE(dataset, "DBO5_out", "NGL_out") %>%
    rename(DBO5_N_ratio_out = ratio) %>%
    #DBO5_P in
    f_nutrient_ratio_basin_PE(dataset, "DBO5_in", "Pt_in") %>%
    rename(DBO5_P_ratio_in = ratio) %>%
    #DBO5_P out
    f_nutrient_ratio_basin_PE(dataset, "DBO5_out", "Pt_out") %>%
    rename(DBO5_P_ratio_out = ratio) %>%
    #DCO_N in
    f_nutrient_ratio_basin_PE(dataset, "DCO_in", "NGL_in") %>%
    rename(DCO_N_ratio_in = ratio) %>%
    #DCO_N out
    f_nutrient_ratio_basin_PE(dataset, "DCO_out", "NGL_out") %>%
    rename(DCO_N_ratio_out = ratio) %>%
    #DCO_P in
    f_nutrient_ratio_basin_PE(dataset, "DCO_in", "Pt_in") %>%
    rename(DCO_P_ratio_in = ratio) %>%
    #DCO_P out
    f_nutrient_ratio_basin_PE(dataset, "DCO_out", "Pt_out") %>%
    rename(DCO_P_ratio_out = ratio) %>%
    
    #capacity ratios
    #Pt_PE_in
    f_nutrient_ratio_basin_PE(dataset, "Pt_in", "capacity") %>%
    rename(Pt_PE_ratio_in = ratio) %>%
    #Pt_PE_out
    f_nutrient_ratio_basin_PE(dataset, "Pt_out", "capacity") %>%
    rename(Pt_PE_ratio_out = ratio) %>%
    #NGL_PE_in
    f_nutrient_ratio_basin_PE(dataset, "NGL_in", "capacity") %>%
    rename(NGL_PE_ratio_in = ratio) %>%
    #NGL_PE_out
    f_nutrient_ratio_basin_PE(dataset, "NGL_out", "capacity") %>%
    rename(NGL_PE_ratio_out = ratio) %>%
    #DBO5_PE_in
    f_nutrient_ratio_basin_PE(dataset, "DBO5_in", "capacity") %>%
    rename(DBO5_PE_ratio_in = ratio) %>%
    #DBO5_PE_out
    f_nutrient_ratio_basin_PE(dataset, "DBO5_out", "capacity") %>%
    rename(DBO5_PE_ratio_out = ratio) %>%
    #DCO_PE_in
    f_nutrient_ratio_basin_PE(dataset, "DCO_in", "capacity") %>%
    rename(DCO_PE_ratio_in = ratio) %>%
    #DCO_PE_out
    f_nutrient_ratio_basin_PE(dataset, "DCO_out", "capacity") %>%
    rename(DCO_PE_ratio_out = ratio) %>%
    #MES_PE_in
    f_nutrient_ratio_basin_PE(dataset, "MES_in", "capacity") %>%
    rename(MES_PE_ratio_in = ratio) %>%
    #MES_PE_out
    f_nutrient_ratio_basin_PE(dataset, "MES_out", "capacity") %>%
    rename(MES_PE_ratio_out = ratio) %>%
    
    #convert from kg per PE per day to g per PE per day
    mutate(
      across(
        c(
          Pt_PE_ratio_in, Pt_PE_ratio_out, NGL_PE_ratio_in, NGL_PE_ratio_out, DBO5_PE_ratio_in, DBO5_PE_ratio_out,
          DCO_PE_ratio_in, DCO_PE_ratio_out, MES_PE_ratio_in, MES_PE_ratio_out
        ), ~.x*1000 
      )
    )
  return(basin_PE)
}

basin_N_P_artois_picardie <- f_all_nutrient_ratios_basin(basin_N_P_artois_picardie, N_P_artois_picardie)

basin_PE_N_P_artois_picardie <- f_all_nutrient_ratios_basin_PE(basin_PE_N_P_artois_picardie, N_P_artois_picardie)

We compute the yields at the basin scale, and also by capacity categories.

Code
#have to do this in case inflow or outflow is more reported than the other one, which would create a bias if we took the ratio of the already aggregated flows
f_yield_basin <- function(basin, dataset, nutrientIN, nutrientOUT){
  temp <- dataset %>% 
    filter(
      is.na(!!as.symbol(nutrientIN))==F & is.na(!!as.symbol(nutrientOUT))==F
      ) %>%
    group_by(Year) %>%
    summarise(
      nutrient_in = sum(!!as.symbol(nutrientIN), na.rm=T),
      nutrient_out = sum(!!as.symbol(nutrientOUT), na.rm=T),
      yield = round((1-nutrient_out/nutrient_in)*100, digits = 0)
    ) %>%
    select(-nutrient_in, -nutrient_out)
  basin <- left_join(
    basin, temp, by="Year"
  )
  return(basin)
}

f_yield_basin_PE <- function(basin_PE, dataset, nutrientIN, nutrientOUT){
  temp <- dataset %>% 
    filter(
      is.na(!!as.symbol(nutrientIN))==F & is.na(!!as.symbol(nutrientOUT))==F
      ) %>%
    group_by(Year, PE_bin) %>%
    summarise(
      nutrient_in = sum(!!as.symbol(nutrientIN), na.rm=T),
      nutrient_out = sum(!!as.symbol(nutrientOUT), na.rm=T),
      yield = round((1-nutrient_out/nutrient_in)*100, digits = 0)
    ) %>%
    select(-nutrient_in, -nutrient_out)
  basin_PE <- left_join(
    basin_PE, temp, by=c("Year", "PE_bin")
  )
  return(basin_PE)
}

f_all_yields_basin <- function(basin, dataset){
  basin <- basin %>%
    #NGL yield
    f_yield_basin(dataset, "NGL_in", "NGL_out") %>%
    rename(NGL_yield = yield) %>%
    #Pt yield
    f_yield_basin(dataset, "Pt_in", "Pt_out") %>%
    rename(Pt_yield = yield) %>%
    #DBO5 yield
    f_yield_basin(dataset, "DBO5_in", "DBO5_out") %>%
    rename(DBO5_yield = yield) %>%
    #DCO yield
    f_yield_basin(dataset, "DCO_in", "DCO_out") %>%
    rename(DCO_yield = yield) %>%
    #MES yield
    f_yield_basin(dataset, "MES_in", "MES_out") %>%
    rename(MES_yield = yield) 
  return(basin)
}

f_all_yields_basin_PE <- function(basin_PE, dataset){
  basin_PE <- basin_PE %>%
    #NGL yield
    f_yield_basin_PE(dataset, "NGL_in", "NGL_out") %>%
    rename(NGL_yield = yield) %>%
    #Pt yield
    f_yield_basin_PE(dataset, "Pt_in", "Pt_out") %>%
    rename(Pt_yield = yield) %>%
    #DBO5 yield
    f_yield_basin_PE(dataset, "DBO5_in", "DBO5_out") %>%
    rename(DBO5_yield = yield) %>%
    #DCO yield
    f_yield_basin_PE(dataset, "DCO_in", "DCO_out") %>%
    rename(DCO_yield = yield) %>%
    #MES yield
    f_yield_basin_PE(dataset, "MES_in", "MES_out") %>%
    rename(MES_yield = yield) 
  return(basin_PE)
}

basin_N_P_artois_picardie <- f_all_yields_basin(basin_N_P_artois_picardie, N_P_artois_picardie)

basin_PE_N_P_artois_picardie <- f_all_yields_basin_PE(basin_PE_N_P_artois_picardie, N_P_artois_picardie)

We create the years categories (every 5 years).

Code
#function to create years categories
f_year_categories <- function(dataset){
  dataset <- dataset %>%
    mutate(
      Year_category = case_when(
        Year %in% c(1991, 1992, 1993, 1994, 1995) ~ "1991-1995",
        Year %in% c(1996, 1997, 1998, 1999, 2000) ~ "1996-2000",
        Year %in% c(2001, 2002, 2003, 2004, 2005) ~ "2001-2005",
        Year %in% c(2006, 2007, 2008, 2009, 2010) ~ "2006-2010",
        Year %in% c(2011, 2012, 2013, 2014, 2015) ~ "2011-2015",
        Year %in% c(2016, 2017, 2018, 2019, 2020) ~ "2016-2020",
      )
    )
  return(dataset)
}
N_P_artois_picardie <- f_year_categories(N_P_artois_picardie)
basin_N_P_artois_picardie <- f_year_categories(basin_N_P_artois_picardie)
basin_PE_N_P_artois_picardie <- f_year_categories(basin_PE_N_P_artois_picardie)

There is a reporting discontinuity in 2008. Before this date, DCO and DBO5 are not reported. On the contrary, starting 2008, NO is no more reported.

Moreover before 2008 the reported nutrient inflows do not match the relation Inflow = Outflow + Removed. We recomputed the inflows based on this relation, and obtained a consistent continuity for the transition in 2008, and after 2008. For the following we will thus use or computed quantity.

NGL is not reported, so we compute it. For the whole period, incoming NGL is approximated by incoming NTK (since NO is negligible for incoming flows). For outgoing NGL, we can compute NGL = NTK + NO only for before 2008 ; since NO is not reported after 2007 we cannot compute outgoing NGL this way after this date. The approximation used is detailed in the tab NGL out after 2007.

Code
f_graph_nutrient <- function(dataset, nutrient_in, nutrient_out, nutrient_removed, nutrient_in_computed, label){
  p <- ggplot(dataset) + 
    #nutrient outflow
    geom_ribbon(
      aes(
        Year, ymin=0, 
        ymax = !!as.symbol(nutrient_out), 
        fill = nutrient_out
        ),
      alpha=.8
      ) +
    #nutrient removed
    geom_ribbon(
      aes(
        Year, ymin=!!as.symbol(nutrient_out), 
        ymax =  !!as.symbol(nutrient_out) + !!as.symbol(nutrient_removed), 
        fill = nutrient_removed
        ),
      alpha=.8
      ) +
    #nutrient inflow computed
    geom_line(
      aes(
        Year, 
        !!as.symbol(nutrient_in), 
        linetype="inflow reported"
        )
      ) + 
    #nutrient inflow reported
    geom_line(
      aes(
        Year, 
        !!as.symbol(nutrient_in_computed), 
        linetype="inflow computed"
        )
      ) + 
    ylim(0, NA) +
    theme(
      legend.title = element_blank()
      ) +
    labs(
      x="", y=paste("kt of", label) , 
      title = paste("Reported", label, "flows in Artois-Picardie WWTPs") ,
      subtitle = "reported, not necessarily actual ; here before data cleaning", 
      caption = Source
      )
  return(p)
}
Code
f_graph_nutrient(basin_N_P_artois_picardie, "Pt_in_reported", "Pt_out", "Pt_removed","Pt_in", "Pt")

Code
f_graph_nutrient(basin_N_P_artois_picardie, "NTK_in_reported", "NTK_out", "NTK_removed","NTK_in", "NTK")

DCO is not reported before 2008.

Code
f_graph_nutrient(basin_N_P_artois_picardie, "DCO_in_reported", "DCO_out", "DCO_removed","DCO_in", "DCO")

Code
#remove 0 values of DCO before 2008
basin_N_P_artois_picardie$DCO_in[basin_N_P_artois_picardie$Year<2008] <- NA
basin_N_P_artois_picardie$DCO_out[basin_N_P_artois_picardie$Year<2008] <- NA
basin_PE_N_P_artois_picardie$DCO_in[basin_PE_N_P_artois_picardie$Year<2008] <- NA
basin_PE_N_P_artois_picardie$DCO_out[basin_PE_N_P_artois_picardie$Year<2008] <- NA
N_P_artois_picardie$DCO_in[N_P_artois_picardie$Year<2008] <- NA
N_P_artois_picardie$DCO_out[N_P_artois_picardie$Year<2008] <- NA

DBO5 is not reported before 2008

Code
f_graph_nutrient(basin_N_P_artois_picardie, "DBO5_in_reported", "DBO5_out", "DBO5_removed","DBO5_in", "DBO5")

Code
#remove 0 values of DBO5 before 2008
basin_N_P_artois_picardie$DBO5_out[basin_N_P_artois_picardie$Year<2008] <- NA
basin_N_P_artois_picardie$DBO5_in[basin_N_P_artois_picardie$Year<2008] <- NA
basin_PE_N_P_artois_picardie$DBO5_out[basin_PE_N_P_artois_picardie$Year<2008] <- NA
basin_PE_N_P_artois_picardie$DBO5_in[basin_PE_N_P_artois_picardie$Year<2008] <- NA
N_P_artois_picardie$DBO5_out[N_P_artois_picardie$Year<2008] <- NA
N_P_artois_picardie$DBO5_in[N_P_artois_picardie$Year<2008] <- NA
Code
f_graph_nutrient(basin_N_P_artois_picardie, "MES_in_reported", "MES_out", "MES_removed","MES_in", "MES")

As we said, before 2008, incoming flows are unreliable and overestimated. But for NO, contrary to the other flows, we cannot compute NO in = NO out + NO removed, because NO is “created” during the treatment through nitrification.

For NGL, which is equal to NO + NTK, we can see that incoming NO is negligible compared to NTK, so we approximate incoming NGL by incoming NTK.

Code
ggplot(basin_N_P_artois_picardie) +
  geom_ribbon(aes(Year, ymin=0, ymax = NTK_in, fill="NTK in computed")) +
  geom_ribbon(aes(Year, ymin=NTK_in, ymax = NTK_in + NO_in_reported, fill="NO in reported")) +
  geom_line(aes(Year, NGL_in), linetype="dashed") +
  annotate(geom="text", x=1997, y=15, label="NGL in ~ NTK in") +
  theme(
    legend.title = element_blank()
  ) +
  labs(
    x="", y="ktN", caption=Source,
    title= "N inflows in Artois-Picardie basin WWTPs"
  )

We can compute NGL out = NTK out + NO out before 2008, when NO is reported. For 2000-2008 the relations holds true at the basin scale. Before that, stations sometimes only report NTK out or NO out, and in that case we do not compute NGL out, explaining the apparent discrepancy.

After that date, it is not possible to assess NGL (contrary to incoming flows, here NO is not negligible).

Code
#we remove th 0 values of NGL out starting 2008
basin_N_P_artois_picardie$NGL_out[basin_N_P_artois_picardie$Year>2007] <- NA
N_P_artois_picardie$NGL_out[N_P_artois_picardie$Year>2007] <- NA
# basin_N_P_artois_picardie$NGL_out[basin_N_P_artois_picardie$Year==1991] <- NA
# N_P_artois_picardie$NGL_out[N_P_artois_picardie$Year==1991] <- NA

#idem for NGL yield
basin_N_P_artois_picardie$NGL_yield[basin_N_P_artois_picardie$Year>2007] <- NA
N_P_artois_picardie$NGL_yield[N_P_artois_picardie$Year>2007] <- NA
Code
ggplot(basin_N_P_artois_picardie) +
  geom_ribbon(aes(Year, ymin=0, ymax = NTK_out, fill="NTK out")) +
  geom_ribbon(aes(Year, ymin=NTK_out, ymax = NTK_out+NO_out, fill="NO out")) +
  annotate(geom="text", x=2000, y=8, label="NGL out = NTK out + NO out", hjust=0) +
  geom_line(aes(Year, NGL_out), linetype="dashed")+
  theme(
    legend.title = element_blank()
  ) +
  labs(
    x="", y="ktN", caption=Source,
    title= "N outflow in Artois-Picardie basin WWTPs"
  )

We try to extrapolate outgoing NO after 2007 at the basin scale, by looking for a constant ratio on the period 1990-2007. NO out / NGL_in roughly meets this criteria, even though the NGL yield drastically changed over the period (see Yields section).

The ratio seems to begin to be roughly consistent over the different capacities starting 2004. We use this ratio averaged over 2004-2007 to compute outgoing NO for 2008-2020, and thus outgoing NGL as a first approximation.

With this we compute, at the basin scale, the NGL yield starting 2008 as well as the nutrient ratios involving NGL. We will imrove this in the future.

Code
#compute NO/NGL out at the basin scale
temp <- N_P_artois_picardie %>% 
  filter(
    Year<2008,
    is.na(NO_out)==F & is.na(NGL_in)==F #only when both are reported
    ) %>%
  group_by(Year, PE_bin) %>%
  summarise(
    NO_out = sum(NO_out, na.rm=T),
    NGL_in = sum(NGL_in, na.rm=T),
    ratio = NO_out/NGL_in
  ) 

ggplot(temp) +
  geom_line(aes(Year, ratio, color = PE_bin)) +
  labs(
    x="", y="",
    title = "Ratio at the basin scale",
    subtitle = "Outgoing NO / Incoming NGL",
    caption = Source,
    color = "WWTP capacity"
  ) +
  ylim(0, 0.2)

Code
ratio_NOout_NGLin <- mean(temp %>% filter(Year>=2004) %>% pull(ratio))

We recompute the yields and ratios at the WWTP scale and the basin values based on these modifications.

Code
#Compute NO out and NGL out
N_P_artois_picardie <- N_P_artois_picardie %>%
  mutate(
    NO_out = 
      case_when(
        Year>2007 ~ NGL_in*ratio_NOout_NGLin,
        T~ NO_out
        ),
    NGL_out = NO_out + NTK_out
  )

#recompute basin values (flows, yields, ratios..)
basin_N_P_artois_picardie <- f_basin_flows(N_P_artois_picardie)
basin_N_P_artois_picardie <- f_all_nutrient_ratios_basin(basin_N_P_artois_picardie, N_P_artois_picardie)
basin_N_P_artois_picardie <- f_all_yields_basin(basin_N_P_artois_picardie, N_P_artois_picardie)
basin_N_P_artois_picardie <- f_year_categories(basin_N_P_artois_picardie)

#recompute basin x PE values (flows, yields, ratios..)
basin_PE_N_P_artois_picardie <- f_basin_PE_flows(N_P_artois_picardie)
basin_PE_N_P_artois_picardie <- f_all_nutrient_ratios_basin_PE(basin_PE_N_P_artois_picardie, N_P_artois_picardie)
basin_PE_N_P_artois_picardie <- f_all_yields_basin_PE(basin_PE_N_P_artois_picardie, N_P_artois_picardie)
basin_PE_N_P_artois_picardie <- f_year_categories(basin_PE_N_P_artois_picardie)

#computes WWTP ratios and NGL yield
N_P_artois_picardie <- N_P_artois_picardie %>%
  mutate(
    #yield
    NGL_yield = (1-NGL_out/NGL_in),
    #ratios
    N_P_ratio_in = NGL_in/Pt_in,
    N_P_ratio_out = NGL_in/Pt_out,
    DCO_DBO5_ratio_in = DCO_in/DBO5_in,
    DCO_DBO5_ratio_out = DCO_out/DBO5_out,
    DBO5_NGL_ratio_in = DBO5_in/NGL_in,
    DBO5_NGL_ratio_out = DBO5_out/NGL_out,
    DBO5_Pt_ratio_in = DBO5_in/Pt_in,
    DBO5_Pt_ratio_out = DBO5_out/Pt_out
  )

#we replace 0 by empty values for basin file whe not reported (before 2008)
basin_N_P_artois_picardie$DBO5_in[basin_N_P_artois_picardie$Year<2008] <- NA
basin_N_P_artois_picardie$DBO5_out[basin_N_P_artois_picardie$Year<2008] <- NA
basin_N_P_artois_picardie$DCO_in[basin_N_P_artois_picardie$Year<2008] <- NA
basin_N_P_artois_picardie$DCO_out[basin_N_P_artois_picardie$Year<2008] <- NA

#we replace 0 by empty values for basin x PEfile whe not reported (before 2008)
basin_PE_N_P_artois_picardie$DBO5_in[basin_PE_N_P_artois_picardie$Year<2008] <- NA
basin_PE_N_P_artois_picardie$DBO5_out[basin_PE_N_P_artois_picardie$Year<2008] <- NA
basin_PE_N_P_artois_picardie$DCO_in[basin_PE_N_P_artois_picardie$Year<2008] <- NA
basin_PE_N_P_artois_picardie$DCO_out[basin_PE_N_P_artois_picardie$Year<2008] <- NA

We only keep the consistent computed values in our file, so we remove the reported inflows.

Code
N_P_artois_picardie <- N_P_artois_picardie %>%
  select(
    #remove inconsistent reported values
    -MES_in_reported, -DBO5_in_reported, -DCO_in_reported, -NGL_in_reported, -Pt_in_reported, -NO_in_reported, -NTK_in_reported,
    #remove redundant "removed" values (can easily be computed)
    -MES_removed, -DBO5_removed, -DCO_removed, -Pt_removed, -NO_removed, -NTK_removed
    )

basin_N_P_artois_picardie <- basin_N_P_artois_picardie %>%
  select(
    #remove inconsistent reported values
    -MES_in_reported, -DBO5_in_reported, -DCO_in_reported, -NGL_in_reported, -Pt_in_reported, -NO_in_reported, -NTK_in_reported,
    #remove redundant "removed" values (can easily be computed)
    -MES_removed, -DBO5_removed, -DCO_removed, -Pt_removed, -NO_removed, -NTK_removed
    )

basin_PE_N_P_artois_picardie <- basin_PE_N_P_artois_picardie %>%
  select(
    #remove inconsistent reported values
    -MES_in_reported, -DBO5_in_reported, -DCO_in_reported, -NGL_in_reported, -Pt_in_reported, -NO_in_reported, -NTK_in_reported,
    #remove redundant "removed" values (can easily be computed)
    -MES_removed, -DBO5_removed, -DCO_removed, -Pt_removed, -NO_removed, -NTK_removed
    )

Data cleaning

Code
f_graph_nutrient <- function(dataset, nutrient_in, nutrient_out, label, legend_x, legend_y){
  p <- ggplot(dataset) + 
    #nutrient inflow
    geom_line(
      aes(
        Year, 
        !!as.symbol(nutrient_in), 
        color=nutrient_in
        )
      ) + 
    #nutrient outflow
    geom_line(
      aes(
        Year,
        !!as.symbol(nutrient_out), 
        color = nutrient_out
        )
      ) +
    ylim(0, NA) +
    theme(
      legend.position = c(legend_x, legend_y), 
      legend.title = element_blank()
      ) +
    labs(
      x="", y=paste("kt of", label) , 
      title = paste("Reported", label, "flows in Artois-Picardie WWTPs") ,
      subtitle = "reported, not necessarily actual ; here before data cleaning", 
      caption = Source
      )
  return(p)
}

We plot the flows at the basin scale to try to see important outliers. For now we do not spot any obvious outlier, so there is no need for correction.

Code
f_graph_nutrient(basin_N_P_artois_picardie, "NGL_in", "NGL_out", "NGL", 0.7, 0.5) +
  labs(subtitle = "After 2007, NGL out is only approximate !")

Code
f_graph_nutrient(basin_N_P_artois_picardie, "NTK_in", "NTK_out", "NTK", 0.7, 0.5) 

Code
f_graph_nutrient(basin_N_P_artois_picardie, "Pt_in", "Pt_out", "Pt", 0.4, 0.5) 

Code
f_graph_nutrient(basin_N_P_artois_picardie, "DBO5_in", "DBO5_out", "DBO5", 0.4, 0.5)  

Code
f_graph_nutrient(basin_N_P_artois_picardie, "DCO_in", "DCO_out", "DCO", 0.4, 0.5)  

Code
f_graph_nutrient(basin_N_P_artois_picardie, "MES_in", "MES_out", "MES", 0.4, 0.5)

Capacities distribution

Code
#file for graph
temp <- N_P_artois_picardie %>%
  group_by(Year) %>%
  summarise(
    capacity = sum(capacity, na.rm = T)/10^6, #capacity in million PE
    nb_WWTP = n()
  )

year_min <- min(temp$Year)
year_max <- max(temp$Year)
n_min <- first(temp$nb_WWTP)
n_max <- last(temp$nb_WWTP)
capacity_min <- round(first(temp$capacity), digits=1)
capacity_max <- round(last(temp$capacity), digits=1)

Even though the number of listed plants in the data base increases from 330 to 560 (a 70% increase) between 1992 and 2018, the total capacity only increases by 10% from 5.8 to 6.4 million Population Equivalent.

This highlights the fact that unreported plants are mostly small and that the plant size distribution is highly skewed, which is discussed in the following 3 tabs.

Code
coef <- max(temp$capacity)/max(temp$nb_WWTP)
ggplot(temp) +
  geom_line(
    aes(
      Year, nb_WWTP, 
      color = "number of reported facilities (left)"
      )
    ) + 
  geom_line(
    aes(
      Year, capacity/coef, 
      color = "total reported capacity (right)"
      )
    ) + 
  scale_y_continuous(
    limits = c(0, NA),
    sec.axis = 
      sec_axis(
        trans=~.*coef, 
        name="million Population Equivalent"
        )
    ) +
  labs(
    title = "Evolution of the reporting in the database",
    subtitle = "in terms of number of WWTP reported and total reported capacity",
    y="", x="", 
    color="", caption =Source
  ) +
  theme(
    legend.position = c(0.7, 0.5)
  )

About half of the total capacity is from WWTP larger than 100 000 population equivalent. About 80-90% is due to WWTP larger than 10 000 population equivalent.

Code
temp <- N_P_artois_picardie %>%
  filter(is.na(capacity)==F) %>%
  select(Year, capacity, PE_bin) %>%
  group_by(Year, PE_bin) %>%
  summarise(
    `capacity (million PE)` = sum(capacity)/10^6,
    `number of stations` = n()
  ) %>% 
  gather(key=capacity_or_n, value = value, `capacity (million PE)`, `number of stations`)
Code
ggplot(temp) + 
  geom_area(aes(Year, value, fill=PE_bin), alpha=.8) + 
  facet_wrap(vars(capacity_or_n), scales="free") + 
  labs(
    title="Reporting in the database",
    subtitle = "For each capacity category",
    x="", y="", fill="nominal capacity \n(Population Equivalent)",
    caption = Source
  )

Code
ggplot(temp) + 
  geom_area(aes(Year, value, fill=PE_bin), position = "fill", alpha=.8) + 
  facet_wrap(vars(capacity_or_n), scales="free") + 
  labs(
    title="Reporting in the database",
    subtitle = "Proportion of each capacity category",
    x="", y="", fill="nominal capacity \n(Population Equivalent)",
    caption = Source
  )

Code
temp <- N_P_artois_picardie %>% filter(Year==Year_analysis)
ggplot(temp) + 
  geom_histogram(
    aes(
      capacity, 
      fill = "Nb of facilities"
      ), 
    n=100, alpha=.4, stat="density"
    ) +
  geom_histogram(
    aes(
      capacity, weight = capacity, 
      fill="Nb of facilities weighted by capacity"
      ), 
    n=100, alpha=.4, stat="density"
    ) +
  theme(
    legend.position = c(0.7,0.8),
  ) +
  labs(
    x="Waste Water Treatment Plant Capacity \n(Population Equivalent)",
    y="Distribution density",
    fill="Distribution of",
    title = paste("WWTP capacities distribution", as.character(Year_analysis)),
    subtitle = "raw or weighted by capacity"
  ) +
  scale_x_log10(
    labels = scales::label_number(drop0trailing = TRUE)
    )

About 1% of the WWTP represent 25% of the total capacity ; 4% represent 50% of the capacity ; 10% represent 75%

Code
temp <- N_P_artois_picardie %>% 
  ungroup() %>%
  filter(Year==Year_analysis) %>% 
  select(code_WWTP, name_WWTP, capacity) %>%
  filter(is.na(capacity) == F) %>%
  arrange(desc(capacity)) %>%
  mutate( 
    cumulative_capacity = cumsum(capacity)/10^6,
    rank_STEU = rank(-capacity, ties.method = "first"),
    percent_cumulative_capacity = round(cumulative_capacity/sum(capacity/10^6)*100, digits = 1),
    percent_rank = round(rank_STEU/n()*100, digits = 1)
    ) 

f_save_csv_files(
  temp %>% mutate(basin = "Artois-Picardie"), 
  "output_data/zipf_law/",
  "zipf_law_01_artois_picardie.csv"
)

coef <- max(temp$rank_STEU)/100
coef2 <- max(temp$cumulative_capacity)/100
Code
ggplot(temp) +
  geom_step(
    aes(
      x = percent_rank, y = percent_cumulative_capacity
      )
    ) + 
  labs(
    title = paste("Cumulative distribution,", Year_analysis),
    subtitle="nb of WWTP vs total capacity",
    x="% of WWTP", y="% of total capacity",
    caption = Source
  ) +
  scale_x_continuous(
    sec.axis = 
      sec_axis(
        trans=~.*coef, name="nb of WWTP",
        labels = scales::label_number(drop0trailing = TRUE)
        )
    ) + 
  scale_y_continuous(
    sec.axis = 
      sec_axis(
        trans=~.*coef2, 
        name="cumulative capacity \n(millions PE)"
        )
    ) + 
    theme(legend.position = "none")

Code
ggplot(temp) +
  geom_step(
    aes(
      x = percent_rank, y = percent_cumulative_capacity
      )
    ) + 
  labs(
    title = paste("Cumulative distribution,", Year_analysis),
    subtitle="nb of WWTP vs total capacity",
    x="% of WWTP", y="% of total capacity",
    caption = Source
  ) +
  scale_x_log10(
    labels = scales::label_number(drop0trailing = TRUE),
    sec.axis = 
      sec_axis(
        trans=~.*coef, 
        name="nb of WWTP",
        labels = scales::label_number(drop0trailing = TRUE)
        )
    ) + 
  scale_y_continuous(
    sec.axis = 
      sec_axis(
        trans=~.*coef2, 
        name="cumulative capacity \n(millions Population Equivalent)"
        )
    ) + 
    theme(legend.position = "none")

Code
ggplot(temp) +
  geom_point(
    aes(
      x = rank_STEU, y = capacity
      )
    ) + 
  labs(
    title = paste("WWTP capacity vs rank,", Year_analysis),
    subtitle = "looking for a Zipf law",
    x="Waste Water Treatment Plant \n(ranked by capacity)",
    y=" Waste Water Treatment Plant capacity\n(Population Equivalent)"
    ) +
  scale_x_log10(
    labels = scales::label_number(drop0trailing = TRUE)
    ) + 
  scale_y_log10(
    labels = scales::label_number(drop0trailing = TRUE)
    )

Pollution flows

Navigate through tabs below to see details for each pollutant. For each pollutant, we present reporting for incoming (left) and outgoing (right) pollution, in terms of number of WWTP reporting the data (bottom) or in terms of installed capacity (top).

Pollution reporting is excellent for NTK, Pt, MES. Starting 2008, DBO5 and DCO are also reported very well. By construction incoming NGL is equal to incoming NTK, so it is equally reported. See how is constructed NGL in the first paragraph #Data quality and description*.

Besides reporting of specific pollutants, one must keep in mind that the number of reported WWTP increases over time, which might create a bias in total quantities reported. However, most of the newly reported facilities are small one, as shown by the rather constant capacity starting 2000.

Code
#function for plots : to be finished
f_graph_reporting_nutrients <- function(pollution_in, pollution_out){
  temp <- N_P_artois_picardie %>%
    select(
      Year, capacity, 
      !!as.symbol(pollution_in), !!as.symbol(pollution_out)
      ) %>%
    mutate(
      nutrient_in = is.na(!!as.symbol(pollution_in))==F,
      nutrient_out = is.na(!!as.symbol(pollution_out))==F
      ) %>%
    gather(
      key=in_out_flow, 
      value = `reported pollution`, 
      nutrient_in, nutrient_out
      ) %>%
    group_by(
      Year, in_out_flow, `reported pollution`
      ) %>%
    summarise(
      `number of WWTP`=n(), 
      `capacity (million PE)` = sum(capacity, na.rm=T)/10^6
      ) %>%
    gather(
      key=n_or_capacity, 
      value = value, 
      `number of WWTP`, `capacity (million PE)`
      ) %>%
    #renaming labels
    mutate(
      in_out_flow = case_when(
        in_out_flow == "nutrient_in" ~ pollution_in,
        in_out_flow == "nutrient_out" ~ pollution_out,
      )
    )

  g <- ggplot(temp) +
    geom_area(aes(Year, value, fill=`reported pollution`)) +
    facet_grid(
      n_or_capacity~in_out_flow, 
      scales="free_y", switch = "y") +
    labs(
      y="", x="",
      title = "Reporting of nutrient inflows (left) and outflows (right)",
      subtitle = "In terms of total capacity (top) and nb of WWTP (bottom)",
      caption = Source
      ) 

  return(g)
}
Code
f_graph_reporting_nutrients("Pt_in", "Pt_out")

Code
f_graph_reporting_nutrients("NTK_in", "NTK_out")

Code
f_graph_reporting_nutrients("NGL_in", "NGL_out") +
  labs(subtitle = "after 2007, NGL out is actually extrpolated from NGL in and NTK out!")

Code
f_graph_reporting_nutrients("DBO5_in", "DBO5_out")

Code
f_graph_reporting_nutrients("DCO_in", "DCO_out")

Code
f_graph_reporting_nutrients("MES_in", "MES_out")

The same as in Data cleaning -> Outliers: first visualization because we did not have to correct major outliers in the Artois-Picardie basin.

We extrapolate the flows at the basin scale for the plants not reporting them. For that, we use a coefficient proportionate to the unreported capacity of the given nutrient flow (see data quality tab).

We compute in terms of installed capacity the reported and unreported flows for NGL, Pt, DBO5, DCO and MES. We do this for each year and for each capacity category.

Code
#create file of reported 
temp <- N_P_artois_picardie %>%
  select(
    Year, PE_bin, capacity, 
    Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out
    ) %>%
  #spots unreported values for each nutrient flow
  mutate(
    across(
      c(Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out),
      ~ is.na(.x)==F
      )
    ) %>%
  #gather to be able to then group by flow and count capacity
  gather(
    key=nutrient_flow, 
    value = reported_pollution, 
    Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out
    )  %>%
  #count reported capacity and unreported capacity for each (Year, capacity category, nutrient flow)
  group_by(
    Year, PE_bin, nutrient_flow, reported_pollution
    ) %>%
  summarise(
    capacity = sum(capacity, na.rm=T)/10^6
    ) %>%
  #creates reported/unreported names for each nutrient flow and spreads into columns
  mutate(
    nutrient_flow = case_when(
      reported_pollution == T ~ paste0(nutrient_flow, "_reported"),
      reported_pollution == F ~ paste0(nutrient_flow, "_unreported")
    )
  ) %>%
  select(-reported_pollution) %>%
  spread(nutrient_flow, capacity)

# NA values replaced by 0 for futur coeff computation
temp[is.na(temp)] <- 0

From this we compute proportionate coefficient to extrapolate real flows.

Code
temp <- temp %>%
  mutate(
    #Pt
    Pt_in_coeff = (Pt_in_reported + Pt_in_unreported)/Pt_in_reported,
    Pt_out_coeff = (Pt_out_reported + Pt_out_unreported)/Pt_out_reported,
    #NGL
    NGL_in_coeff = (NGL_in_reported + NGL_in_unreported)/NGL_in_reported,
    NGL_out_coeff = (NGL_out_reported + NGL_out_unreported)/NGL_out_reported,
    #DBO5
    DBO5_in_coeff = (DBO5_in_reported + DBO5_in_unreported)/DBO5_in_reported,
    DBO5_out_coeff = (DBO5_out_reported + DBO5_out_unreported)/DBO5_out_reported,
    #DCO
    DCO_in_coeff = (DCO_in_reported + DCO_in_unreported)/DCO_in_reported,
    DCO_out_coeff = (DCO_out_reported + DCO_out_unreported)/DCO_out_reported,
    #MES
    MES_in_coeff = (MES_in_reported + MES_in_unreported)/MES_in_reported,
    MES_out_coeff = (MES_out_reported + MES_out_unreported)/MES_out_reported,
  ) %>%
  select(
    -c(
      Pt_in_reported, Pt_in_unreported,
      Pt_out_reported, Pt_out_unreported,
      NGL_in_reported, NGL_in_unreported,
      NGL_out_reported, NGL_out_unreported,
      DBO5_in_reported, DBO5_in_unreported,
      DBO5_out_reported, DBO5_out_unreported,
      DCO_in_reported, DCO_in_unreported, 
      DCO_out_reported, DCO_out_unreported, 
      MES_in_reported, MES_in_unreported, 
      MES_out_reported, MES_out_unreported
      )
    )

#replace Inf values by 1 (check if it does not create problems)
temp[temp == Inf] <- 1

With these coefficients we compute the adjusted flows

Code
#file with reported flows and adjustment coefficient
temp2 <- left_join(
  basin_PE_N_P_artois_picardie %>%
    #selects only flows and not yields or ratios
    select(
      Year, PE_bin, 
      Pt_in, Pt_out, NGL_in, NGL_out, DBO5_in, DBO5_out, DCO_in, DCO_out, MES_in, MES_out
    ),
  temp, by=c("Year", "PE_bin")
)

#computes adjusted flows
temp2 <- temp2 %>%
  mutate(
    #Pt
    Pt_in_adj = round(Pt_in_coeff*Pt_in, 5),
    Pt_out_adj = round(Pt_out_coeff*Pt_out, 5),
    #NGL
    NGL_in_adj = round(NGL_in_coeff*NGL_in, 5),
    NGL_out_adj = round(NGL_out_coeff*NGL_out, 5),
    #DBO5
    DBO5_in_adj = round(DBO5_in_coeff*DBO5_in, 5),
    DBO5_out_adj = round(DBO5_out_coeff*DBO5_out, 5),
    #DCO
    DCO_in_adj = round(DCO_in_coeff*DCO_in, 5),
    DCO_out_adj = round(DCO_out_coeff*DCO_out, 5),
    #MES
    MES_in_adj = round(MES_in_coeff*MES_in, 5),
    MES_out_adj = round(MES_out_coeff*MES_out, 5)
  ) %>%
  #we remove coefficients and unajusted flows
  select(
    -c(
      Pt_in, Pt_in_coeff, Pt_out, Pt_out_coeff,
      NGL_in, NGL_in_coeff, NGL_out, NGL_out_coeff,
      DBO5_in, DBO5_in_coeff, DBO5_out, DBO5_out_coeff,
      DCO_in, DCO_in_coeff, DCO_out, DCO_out_coeff, 
      MES_in, MES_in_coeff, MES_out, MES_out_coeff
    )
  )

We add these adjusted flows to the main files reporting flows at the basin scale

Code
#adding adjusted flows to the basin x capacity files
basin_PE_N_P_artois_picardie <- left_join(
  basin_PE_N_P_artois_picardie, temp2, by=c("Year", "PE_bin")
)

#aggregating adjusted flows at the basin scale without the capacity categories
temp <- temp2 %>%
  select(-PE_bin) %>%
  group_by(Year) %>%
  summarise_all(~signif(sum(.x), 3))

#adding adjusted flows to the basin files
basin_N_P_artois_picardie <- left_join(
  basin_N_P_artois_picardie, temp, by="Year"
)

We plot the comparison reported / adjusted in the following graphs. For the Artois-Picardie basin, the difference is very marginal.

Code
f_graph_adjusted <- function(basin_file, basin_PE_file, nutrient_adjusted, nutrient_reported, nutrient_label){
  g <- plot_grid(
    ggplot(basin_PE_file) +
      geom_line(
        data = basin_file,
        aes(Year, !!as.symbol(nutrient_adjusted)), 
        color="black", size=1
        ) + 
      geom_area(
        aes(Year, !!as.symbol(nutrient_reported), fill=PE_bin), 
        alpha=.7
        ) + 
      theme(legend.position = "none") +
      labs(
        x="", y="kt per year",
        caption = "\n",
        title = paste("Adjusted", nutrient_label, "flows")
    ),
    ggplot(basin_PE_file) +
      geom_line(
        aes(Year, !!as.symbol(nutrient_adjusted), color=PE_bin), 
        size=1
        ) + 
      geom_area(
        aes(Year, !!as.symbol(nutrient_reported), fill=PE_bin), 
        alpha=.7
        ) + 
      theme(legend.position = "none") +
      facet_wrap(vars(PE_bin), scales="free") +
      labs(
        x="", y="",
        caption = Source,
        title = "",
        subtitle = "line: adjusted flow ; area: reported flow"
      ),
    rel_widths = c(0.3, 0.7)
  )
  return(g)
}
Code
f_graph_adjusted(
  basin_N_P_artois_picardie, 
  basin_PE_N_P_artois_picardie,
  "Pt_in_adj", "Pt_in", "incoming Pt"
  )

Code
f_graph_adjusted(
  basin_N_P_artois_picardie, 
  basin_PE_N_P_artois_picardie,
  "Pt_out_adj", "Pt_out", "discharged Pt"
  )

Code
f_graph_adjusted(
  basin_N_P_artois_picardie, 
  basin_PE_N_P_artois_picardie,
  "NGL_in_adj", "NGL_in", "incoming NGL"
  )

Code
f_graph_adjusted(
  basin_N_P_artois_picardie, 
  basin_PE_N_P_artois_picardie,
  "NGL_out_adj", "NGL_out", "discharged NGL"
  )

Code
f_graph_adjusted(
  basin_N_P_artois_picardie, 
  basin_PE_N_P_artois_picardie,
  "DBO5_in_adj", "DBO5_in", "incoming DBO5"
  )

Code
f_graph_adjusted(
  basin_N_P_artois_picardie, 
  basin_PE_N_P_artois_picardie,
  "DBO5_out_adj", "DBO5_out", "discharged DBO5"
  )

Code
f_graph_adjusted(
  basin_N_P_artois_picardie, 
  basin_PE_N_P_artois_picardie,
  "DCO_in_adj", "DCO_in", "incoming DCO"
  )

Code
f_graph_adjusted(
  basin_N_P_artois_picardie, 
  basin_PE_N_P_artois_picardie,
  "DCO_out_adj", "DCO_out", "discharged DCO"
  )

Code
f_graph_adjusted(
  basin_N_P_artois_picardie, 
  basin_PE_N_P_artois_picardie,
  "MES_in_adj", "MES_in", "incoming MES"
  )

Code
f_graph_adjusted(
  basin_N_P_artois_picardie, 
  basin_PE_N_P_artois_picardie,
  "MES_out_adj", "MES_out", "discharged MES"
  )

Ratios

Code
#temporal P/N ratio
ggplot(basin_N_P_artois_picardie) + 
  geom_line(aes(Year, N_P_ratio_in, color="N:P in")) + 
  geom_line(aes(Year, N_P_ratio_out, color = "N:P out")) + 
  ylim(0, NA) +
  theme(
    legend.position = c(0.7, 0.6)
  ) +
  labs(
    x="", y="N:P ratio",
    title = "N:P ratio in Artois-Picardie basin",
    subtitle = "Careful ! After 2007, N outflow is approximate !",
    caption=Source, color=""
  )

Code
ggplot(basin_N_P_artois_picardie) + 
  geom_line(aes(Year, DCO_DBO5_ratio_in, color="DCO:DBO5 in")) + 
  geom_line(aes(Year, DCO_DBO5_ratio_out, color = "DCO:DBO5 out")) + 
  ylim(0, NA) +
  theme(
    legend.position = c(0.7, 0.6)
  ) +
  labs(
    x="", y="DCO:DBO5 ratio",
    title = "DCO:DBO5 ratio in Artois-Picardie basin",
    subtitle = "decrease in outflow shows biodegradation",
    caption=Source, color=""
  )

Code
ggplot(basin_N_P_artois_picardie) +
  geom_point(
    aes(
      DBO5_N_ratio_in, DBO5_P_ratio_in,
      color=Year_category
      )
    ) +
  geom_point(
    aes(
      DBO5_N_ratio_out, DBO5_P_ratio_out,
      color=Year_category
        )
    ) +
  ylim(0, NA) +
  annotate(
    geom="text", label ="inflow",
    x=4, y=25
  ) +
  annotate(
    geom="text", label ="outflow",
    x=1.3, y=8
  ) +
  labs(
    x="DBO5:Pt ratio", y="DBO5:NGL ratio",
    title = "DBO5:NGL vs DBO5:Pt ratio in Artois-Picardie basin WWTPs",
    subtitle = "Careful ! After 2007, NGL out is only approximate",
    caption=Source, color=""
  )

Basin yield

Code
ggplot(basin_N_P_artois_picardie) + 
  geom_line(aes(Year, Pt_yield, color="P")) + 
  geom_line(aes(Year, NGL_yield, color = "N")) + 
  geom_line(aes(Year, DBO5_yield, color = "DBO5")) +
  geom_line(aes(Year, DCO_yield, color = "DCO")) +
  geom_line(aes(Year, MES_yield, color = "MES")) +
  ylim(0,100) +
  theme(legend.position = c(0.7, 0.3)) +
  labs(
    title = "Global abatement rate of Artois-Picardie WWTPs", 
    x="", y="Yield (%)", color="",
    caption = Source
    )

Code
f_graph_yield_PE <- function(dataset, nutrient_yield, nutrient_label){
  g <- ggplot(dataset) + 
    geom_line(aes(Year, !!as.symbol(nutrient_yield), color=PE_bin)) + 
    ylim(0,100) +
    theme(legend.position = c(0.7, 0.3)) +
    labs(
      title = paste("Global", nutrient_label, "yield of Artois-Picardie basin"), 
      subtitle = "by capacity (population equivalent)",
      x="", y="Yield (%)", color="",
      caption = Source
      )
  return(g)
}
Code
f_graph_yield_PE(
  basin_PE_N_P_artois_picardie %>% filter(PE_bin!="0 - 200 PE"), 
  "Pt_yield", "Pt")

Code
f_graph_yield_PE(
  basin_PE_N_P_artois_picardie %>% filter(PE_bin!="0 - 200 PE"), 
  "NGL_yield", "NGL")

Code
f_graph_yield_PE(
  basin_PE_N_P_artois_picardie %>% filter(PE_bin!="0 - 200 PE"), 
  "DBO5_yield", "DBO5")

Code
f_graph_yield_PE(
  basin_PE_N_P_artois_picardie %>% filter(PE_bin!="0 - 200 PE"), 
  "DCO_yield", "DCO")

Code
f_graph_yield_PE(
  basin_PE_N_P_artois_picardie %>% filter(PE_bin!="0 - 200 PE"), 
  "MES_yield", "MES")

Save data

We artificially extend the data in 2019 and 2020 from 2018 value to compare to the other basins on this period.

Code
temp <- N_P_artois_picardie %>% 
  filter(Year==2018)
N_P_artois_picardie <- bind_rows(
  N_P_artois_picardie, 
  temp %>% mutate(Year=2019), 
  temp %>% mutate(Year=2020) 
)

temp <- basin_N_P_artois_picardie %>% 
  filter(Year==2018)
basin_N_P_artois_picardie <- bind_rows(
  basin_N_P_artois_picardie, 
  temp %>% mutate(Year=2019), 
  temp %>% mutate(Year=2020) 
)

temp <- basin_PE_N_P_artois_picardie %>% 
  filter(Year==2018)
basin_PE_N_P_artois_picardie <- bind_rows(
  basin_PE_N_P_artois_picardie, 
  temp %>% mutate(Year=2019), 
  temp %>% mutate(Year=2020) 
)

We save the file for our analysis at the national scale combining all different basins. We do not save data for individual WWTP NGL yield and NO and NGL outflows, since they are approximations valid only at the basin level.

Code
#all WWTP file
path_output <- "output_data/all_WWTP/"
temp <- N_P_artois_picardie %>%
  select(
    code_WWTP, name_WWTP, Year, capacity, name_commune, INSEE_COM, lat_WWTP, long_WWTP, PE_bin,
    DBO5_in, DCO_in, MES_in, NGL_in, NTK_in, Pt_in,
    DBO5_out, DCO_out, MES_out, NTK_out, Pt_out,
    DBO5_yield, DCO_yield, MES_yield, Pt_yield,
  )

f_save_csv_files(
  temp,
  path_output,
  "all_WWTP_01_artois_picardie.csv"
)

#basin agregated file
path_output <- "output_data/basins/"
temp <- basin_N_P_artois_picardie %>%
  select(
    Year, capacity, nb_WWTP,
    #flows reported and adjusted
    Pt_in, Pt_in_adj, NGL_in, NGL_in_adj, 
    DBO5_in, DBO5_in_adj, DCO_in, DCO_in_adj, MES_in, MES_in_adj,
    Pt_out, Pt_out_adj, NGL_out, NGL_out_adj,
    DBO5_out, DBO5_out_adj, DCO_out, DCO_out_adj, MES_out, MES_out_adj,
    #yields
    NGL_yield, Pt_yield, DBO5_yield, DCO_yield, MES_yield, 
    #nutrient ratios
    N_P_ratio_in, N_P_ratio_out,
    DBO5_N_ratio_in, DBO5_N_ratio_out, DBO5_P_ratio_in, DBO5_P_ratio_out,
    DCO_N_ratio_in, DCO_N_ratio_out, DCO_P_ratio_in, DCO_P_ratio_out,
    DCO_DBO5_ratio_in, DCO_DBO5_ratio_out,
    #PE ratios
    Pt_PE_ratio_in, Pt_PE_ratio_out, NGL_PE_ratio_in, NGL_PE_ratio_out, 
    DBO5_PE_ratio_in, DBO5_PE_ratio_out, DCO_PE_ratio_in, DCO_PE_ratio_out, MES_PE_ratio_in, MES_PE_ratio_out
  )
f_save_csv_files(
  temp,
  path_output,
  "basin_01_artois_picardie.csv"
)

#basin x PE agregated file
path_output <- "output_data/basins_PE/"
temp <- basin_PE_N_P_artois_picardie %>%
  select(
    Year, capacity, nb_WWTP, PE_bin, 
    #flows reported and adjusted
    Pt_in, Pt_in_adj, NGL_in, NGL_in_adj, 
    DBO5_in, DBO5_in_adj, DCO_in, DCO_in_adj, MES_in, MES_in_adj,
    Pt_out, Pt_out_adj, NGL_out, NGL_out_adj,
    DBO5_out, DBO5_out_adj, DCO_out, DCO_out_adj, MES_out, MES_out_adj,
    #yields
    NGL_yield, Pt_yield, DBO5_yield, DCO_yield, MES_yield, 
    #nutrient ratios
    N_P_ratio_in, N_P_ratio_out,
    DBO5_N_ratio_in, DBO5_N_ratio_out, DBO5_P_ratio_in, DBO5_P_ratio_out,
    DCO_N_ratio_in, DCO_N_ratio_out, DCO_P_ratio_in, DCO_P_ratio_out,
    DCO_DBO5_ratio_in, DCO_DBO5_ratio_out,
    #PE ratios
    Pt_PE_ratio_in, Pt_PE_ratio_out, NGL_PE_ratio_in, NGL_PE_ratio_out, 
    DBO5_PE_ratio_in, DBO5_PE_ratio_out, DCO_PE_ratio_in, DCO_PE_ratio_out, MES_PE_ratio_in, MES_PE_ratio_out
  )
f_save_csv_files(
  temp,
  path_output,
  "basin_PE_01_artois_picardie.csv"
)
Code
rm(list = ls())