import os
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from ..utils.exceptions import user_warning
try:
from mpl_toolkits.basemap import Basemap
except ImportError:
user_warning('The Basemap package is required to use BreakwaterDatabase')
class BreakwaterDatabase:
    """ Breakwater Database

    Import the breakwater database developed by Allsop et al. (2009),
    which consists of completed breakwater projects with data ranging
    from design wave height to contractor. The constructed breakwaters
    are classified by breakwater type, the following types are
    currently included: Rubble Mound, Composite, Berm, Caisson and
    Revetments.

    The breakwater database is a separate module of :py:obj:`breakwater`
    and can be imported with the following command:

    .. code-block:: python

        from breakwater.database import BreakwaterDatabase

    .. note::
        To use this module the :py:obj:`Basemap` package is required,
        this dependency is additional to the dependencies mentioned in
        Section `2.1 <install.html#Dependencies>`__. See the following
        `link <https://matplotlib.org/basemap/users/installing.html>`__
        for an installation of :py:obj:`Basemap`

    Parameters
    ----------
    update : bool, optional, default: False
        if False the data is not updated and the included data is
        loaded, if True the database is loaded from the
        :py:attr:`source`

    Attributes
    ----------
    df : pd.DataFrame
        DataFrame of the breakwater database
    source : str
        url of the source
    """

    # spellings of the breakwater types that occur in the source,
    # mapped to the names used throughout this class
    _TYPE_ALIASES = {
        'Rubble mound': 'Rubble Mound',
        'rubble mound': 'Rubble Mound',
        'Composite breakwater': 'Composite',
        'composite': 'Composite',
        'berm breakwater': 'Berm',
        'Berm breakwater': 'Berm',
        'Berm Breakwater': 'Berm',
        'Caisson breakwater': 'Caisson',
        'caisson breakwater': 'Caisson',
        'caisson': 'Caisson'}

    # armour unit names that are listed as a type in the source, these
    # entries are reclassified as Rubble Mound
    _ARMOUR_UNITS = [
        'Tetrapod', 'Antifer', 'Cubes', 'Accropode', 'Accropode II',
        'Core-Loc', 'Xbloc', 'XblocPlus', 'Dolos', 'COB', 'Stabit',
        'Cubipod']

    def __init__(self, update=False):
        """ See help(BreakwaterDatabase) for more info """
        # set source of the database as attribute
        self.source = (
            'http://kennisbank-waterbouw.tudelft.nl/breakwaters/printall.php')

        if update:
            # update data from the source
            df = self._load_from_source(self.source)
        else:
            # load data from the csv file next to this module, the path
            # is joined with os.path so it is valid on every platform
            folder = os.path.dirname(os.path.abspath(__file__))
            df = pd.read_csv(
                os.path.join(folder, 'database.csv'), index_col=0)

        # set df as attribute
        self.df = df

        # set markers and colors as private attributes
        self._markers = {
            'Rubble Mound': 'o', 'Caisson': 's', 'Berm': 'D',
            'Revetment': '>', 'Composite': '*', 'Unclassified': 'p'}
        self._colors = {
            'Rubble Mound': '#1f77b4', 'Caisson': '#ff7f0e',
            'Berm': '#2ca02c', 'Revetment': '#d62728',
            'Composite': '#9467bd', 'Unclassified': '#bcbd22'}

    @classmethod
    def _load_from_source(cls, source):
        """ Download the database from the source and normalise it

        Raises
        ------
        ValueError
            If none of the tables at the source has more than 10
            columns
        """
        tables = pd.read_html(source)

        # the database is recognised by its number of columns
        df = next((table for table in tables if table.shape[1] > 10), None)
        if df is None:
            raise ValueError(
                f'No table with more than 10 columns found at {source}')

        # check if headers are correct
        if df.columns.dtype != object:
            # set first row as header, copy so the slice can be edited
            header = df.iloc[0]
            df = df.iloc[1:].copy()
            df.columns = header

        # set id column as index
        df = df.set_index('id')
        df.index = df.index.astype(np.int64)

        # fix column names of front and rear
        df.columns = cls._expand_side_columns(df.columns)

        # convert columns with numeric values to a numeric dtype
        df = df.apply(cls._to_numeric_if_possible)

        # format types and fix incorrectly classified types
        df['type'] = df['type'].replace(cls._TYPE_ALIASES)
        df['type'] = df['type'].replace(cls._ARMOUR_UNITS, 'Rubble Mound')
        return df

    @staticmethod
    def _expand_side_columns(columns):
        """ Give the unnamed columns after front and rear a name

        A front or rear column that is followed by unnamed (null)
        columns is replaced by four named columns, the unnamed columns
        themselves are dropped from the list of names.
        """
        names = ['armour', '(unit)', 'size', 'slope']
        new_columns = []
        for i, column in enumerate(columns):
            # the parentheses matter: without them `and` binds tighter
            # than `or` and the null check is skipped for front
            if (column in ('front', 'rear')
                    and pd.isnull(columns[i + 1:i + 4]).any()):
                new_columns.extend(f'{column} {name}' for name in names)
            elif pd.isnull(column):
                continue
            else:
                new_columns.append(column)
        return new_columns

    @staticmethod
    def _to_numeric_if_possible(column):
        """ Convert a column to numeric, return it unchanged on failure """
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    @staticmethod
    def _del_zeros(df):
        """ Delete rows containing a zero

        Returns a new DataFrame, the given DataFrame is not changed.
        """
        contains_zero = df.eq(0).any(axis=1)
        return df.loc[~contains_zero].copy()

    @staticmethod
    def _show_unclassified(show, df):
        """ Helper method to filter the df for unclassified bw types

        Returns a new DataFrame so that the database stored on the
        instance is never modified by a report or a plot.
        """
        if show:
            # replace nan with unclassified so label is correct in plot
            return df.assign(type=df['type'].fillna('Unclassified'))
        # remove unclassified breakwaters
        return df.dropna(subset=['type'])

    @staticmethod
    def _validate_excludes(input):
        """ Verify the input for the correct type """
        if input is not None and not isinstance(input, list):
            # raise TypeError for incorrect type of exclude
            raise TypeError(
                ('Types to exclude must be given in a list, not as '
                 f'{type(input).__name__}'))

    @staticmethod
    def _validate_all_excludes(input):
        """ Validate if specified types have not been excluded """
        if input:
            # not all given types were in the database,
            # and were thus not excluded, therefore show warning
            not_excluded = ', '.join(input)
            user_warning(
                ('The following types in exclude are not in the database: '
                 f'{not_excluded}'))

    @staticmethod
    def _partition_types(types, exclude):
        """ Split the types into types to plot and unmatched excludes

        Returns a tuple with the types that are not excluded and the
        entries of exclude that do not occur in types. The list given
        as exclude is left untouched.
        """
        types = list(types)
        excluded = [] if exclude is None else list(exclude)
        kept = [bw_type for bw_type in types if bw_type not in excluded]
        unmatched = [name for name in excluded if name not in types]
        return kept, unmatched

    @staticmethod
    def _bin_edges(upper, bins):
        """ Return the edges of bins equally wide bins on [0, upper] """
        # n bins are bounded by n + 1 edges
        return np.linspace(0, upper, bins + 1)

    @staticmethod
    def _parse_slope(raw):
        """ Parse a slope given as 'V:H' into a tuple of floats

        Returns (0.0, 0.0) if the value can not be parsed, so that the
        slope is treated as missing.
        """
        try:
            vertical, horizontal = (
                float(part) for part in str(raw).split(':'))
        except ValueError:
            return 0.0, 0.0
        if np.isnan(vertical) or np.isnan(horizontal):
            return 0.0, 0.0
        return vertical, horizontal

    @staticmethod
    def _report_cells(values, column, bw_type, with_stats, decimals):
        """ Generate the cells of one column in the data report

        Returns one cell with the number of datapoints, and a second
        cell with the mean and standard deviation if with_stats is True
        """
        total = len(values)

        if with_stats:
            if column == 'wall' and bw_type != 'Caisson':
                # wall is not a parameter of other structures
                return ['-', '-']
            # a zero means that the value is missing
            known = values.mask(values == 0)
            missing = known.isna().sum()
            mean = float(np.round(known.mean(), decimals))
            std = float(np.round(known.std(), decimals))
            return [f'{total-missing}/{total}', f'{mean}±{std}']

        if pd.api.types.is_numeric_dtype(values.dtype):
            # numeric column for which no statistics are required
            known = values.mask(values == 0)
        elif 'slope' in column:
            if bw_type in ('Caisson', 'Composite'):
                # have slope 1:0 but this is not needed since these
                # are vertical structures
                return ['-']
            # slope 1:0 is specified when the slope is unknown
            known = values.mask(values == '1:0')
        else:
            known = values

        missing = known.isna().sum()
        return [f'{total-missing}/{total}']

    @property
    def unclassified(self):
        """ Get the number of unclassified breakwaters """
        return self.df['type'].isna().sum()

    def report(self, save=True, save_path='data_report.xlsx', decimals=3):
        """ Make a report of the data in the database

        Method to generate a data report of the database. For each
        breakwater type the total and missing number of datapoints is
        determined. Furthermore, for numerical values the mean and
        standard deviation is also computed. The database itself is
        not changed by generating the report.

        .. note::
            Note that the mean and standard deviation is not computed
            for the following parameters: start, finish, front size,
            rear size, latitude and longitude

        Parameters
        ----------
        save : bool, optional, default: True
            If True an Excel version of the data report is generated,
            use :py:obj:`save_path` to specify a save path
        save_path : str
            File path to save the Excel file to
        decimals : int, optional, default: 3
            Number of decimal places to round the mean and standard
            deviation to. If decimals is negative, it specifies the
            number of positions to the left of the decimal point.

        Returns
        -------
        pd.DataFrame
            if :py:obj:`save` is False a DataFrame of the report is
            returned
        """
        # drop unclassified breakwaters, on a copy of the database
        df = self._show_unclassified(False, self.df)

        # define columns of which the mean and standard deviation does
        # not have to be computed
        no_computations_required = {
            'start', 'finish', 'front size', 'rear size', 'Lat', 'Lon'}

        # determine the layout of the report from the dtypes, so that
        # the headers do not depend on the order of the bw types
        header = [('Type', '')]
        layout = []
        for column, dtype in df.dtypes.items():
            if column == 'type':
                # the type is the first column of the report
                continue
            with_stats = (
                pd.api.types.is_numeric_dtype(dtype)
                and column not in no_computations_required)
            layout.append((column, with_stats))
            if with_stats:
                header.append((column, 'datapoints'))
                header.append((column, 'μ ± σ'))
            else:
                header.append((column, ''))

        # generate one row for each bw type
        rows = []
        for bw_type in df['type'].unique():
            # get only the bws of the current type
            bws = df[df['type'] == bw_type]
            row = [bw_type]
            for column, with_stats in layout:
                row.extend(self._report_cells(
                    bws[column], column, bw_type, with_stats, decimals))
            rows.append(row)

        # create df with double row of headers
        data_report = pd.DataFrame(
            rows, columns=pd.MultiIndex.from_tuples(header))

        # check if df must be saved
        if save:
            data_report.to_excel(save_path)
        else:
            return data_report

    def correlation(self, param1, param2, bw_type=None, method='pearson'):
        """ Compute the correlation between two parameters

        Breakwaters for which one of the parameters is zero or missing
        are not considered.

        Parameters
        ----------
        param1 : str
            name of parameter 1
        param2 : str
            name of parameter 2
        bw_type : str, optional, default: None
            if specified only the values of the given bw_type are
            considered
        method : {pearson, spearman}, optional, default: pearson
            method of correlation

        Returns
        -------
        tuple
            correlation between the two parameters and the p-value

        Raises
        ------
        NotImplementedError
            If method is not pearson or spearman
        """
        if method not in ('pearson', 'spearman'):
            raise NotImplementedError(
                f'{method} is not supported, must be pearson or spearman')

        # select the breakwaters to consider
        df = self.df
        if bw_type is not None:
            # only for bw_type
            df = df[df['type'] == bw_type]

        # get the two columns and remove zeros and missing values
        filtered_df = self._del_zeros(df[[param1, param2]]).dropna()
        values1 = filtered_df[param1].values
        values2 = filtered_df[param2].values

        if method == 'pearson':
            return stats.pearsonr(values1, values2)
        return stats.spearmanr(values1, values2)

    def cross_section(self, id, B=None, Rc=None, h=None, slope=None):
        """ Plot a cross section of the breakwater

        Method to plot a cross section of a breakwater in the database.
        The breakwater is selected by the id of the breakwater. In case
        data is missing to plot the breakwater it is possible to specify
        these as arguments. A custom value is only used if the value is
        missing in the database.

        .. warning::
            plot function currently only supports Rubble Mound and
            caisson breakwaters

        Parameters
        ----------
        id : int
            id of the breakwater to plot
        B : float, optional, default: None
            specify custom crest width
        Rc : float, optional, default: None
            specify custom crest height
        h : float, optional, default: None
            specify a custom water level
        slope : tuple, optional, default: None
            specify a custom slope, must be specified as a tuple (V, H)

        Raises
        ------
        ValueError
            If the id is not in the database or if data required to
            plot a cross section is missing
        NotImplementedError
            If the type of the breakwater is not supported
        """
        # a zero marks a custom value as not specified, None is replaced
        # so that a missing value always results in a ValueError
        if slope is None:
            slope = (0, 0)
        custom_h = 0 if h is None else h
        custom_Rc = 0 if Rc is None else Rc
        custom_B = 0 if B is None else B

        # get the data of the bw
        bw = self.df[self.df.index == id]
        if bw.empty:
            raise ValueError(f'No breakwater with id {id} in the database')

        def value(column):
            """ Get the value of the column for the selected bw """
            return bw[column].values[0]

        bw_type = value('type')

        # get hydraulic data
        depth = _validate_plot_vals('depth', value('depth'), custom_h)

        # get geometric data
        freeboard = _validate_plot_vals('Rc', value('heigth'), custom_Rc)
        width = _validate_plot_vals('B', value('width'), custom_B)

        # set infobox with general info
        cost = value('cost(M$)')
        infobox = '\n'.join((
            f'constructed between {value("start")} and {value("finish")}',
            f'cost = {cost} M$',
            f'owner = {value("owner")}',
            f'contractor = {value("contractor")}',
            f'consultant = {value("consultant")}',
            f'Hs = {value("Hs")} m',
            f'Tz = {value("Tz")} s',
            f'Tp = {value("Tp")} s',
            f'Rc = {freeboard} m',
            f'h = {depth} m',))

        # check type
        if bw_type == 'Rubble Mound':
            # get the slope of the bw, the slope is unknown if one of
            # the parts is zero, in that case the custom slope is used
            # as a whole so V and H are never mixed
            slope_v, slope_h = self._parse_slope(value('front slope'))
            unknown = slope_v == 0 or slope_h == 0
            V = _validate_plot_vals(
                'slope', 0 if unknown else slope_v, slope[0])
            H = _validate_plot_vals(
                'slope', 0 if unknown else slope_h, slope[1])

            # compute coordinates
            x1 = H*(depth+freeboard)/V
            xwlev = H*freeboard/V
            x = [-x1-0.5*width, -0.5*width, 0.5*width, x1+0.5*width]
            y = [0, depth+freeboard, depth+freeboard, 0]

            # set additional width for xmin and xmax
            increase_x = 0

            # add extra info of rubble mound
            armour = value('front armour')
            size = value('front size')
            unit = value('front (unit)')
            infobox = '\n'.join((
                infobox,
                f'armour = {armour} of {size} {unit}',
                f'slope = {V}:{H}'))
        elif bw_type == 'Caisson':
            # compute coordinates
            xwlev = 0
            x = [-0.5*width, -0.5*width, 0.5*width, 0.5*width]
            y = [0, depth+freeboard, depth+freeboard, 0]

            # set additional width for xmin and xmax
            increase_x = 2*width
        else:
            raise NotImplementedError(
                f'{bw_type} is currently not supported for plotting')

        # create figure
        fig, ax = plt.subplots(figsize=(10, 5))

        # plot bw
        ax.plot(x, y, color='k', lw=1)

        # get the xmin and xmax
        xmin = ax.get_xlim()[0]*1.2 - increase_x
        xmax = ax.get_xlim()[1]*1.2 + increase_x

        # plot bottom and wlev (left and right)
        ax.axhline(y=0, color='peru', lw=1, zorder=5)
        ax.hlines(
            y=depth, xmin=xmin, xmax=-xwlev-0.5*width, color='dodgerblue',
            lw=1)
        ax.hlines(
            y=depth, xmin=xwlev+0.5*width, xmax=xmax, color='dodgerblue',
            lw=1)

        # place the info box with all info
        props = dict(boxstyle='round', facecolor='whitesmoke', alpha=0.5)
        ax.text(
            1.02, 0.99, infobox, transform=ax.transAxes, fontsize=9,
            verticalalignment='top', bbox=props)

        # format the figure
        ax.set_xlim(xmin, xmax)
        plt.title(
            (f'Cross section of the {bw_type.lower()} breakwater at '
             f'{value("harbour")}, {value("country")}'))
        ax.set_aspect('equal', adjustable='box')
        ax.grid()
        fig.tight_layout()
        plt.show()

    def map(
            self, area=None, resolution='c', show_unclassified=False,
            exclude=None):
        """ Plot the breakwaters on a world map

        Method to plot all breakwaters with coordinates on a map of
        the world, or part of the world if an area is specified. Method
        uses :py:obj:`Basemap` to generate the map.

        Parameters
        ----------
        area : list, optional, default: None
            specify the coordinates of the area to plot. Use following
            format [llcrnrlon, llcrnrlat, urcrnrlon, urcrnrlat]. The
            full map is plotted if the area is None or empty.
        resolution : str, optional, default: c
            resolution of the map to use. Can be c (crude), l (low),
            i (intermediate), h (high), f (full).
        show_unclassified : bool, optional, default: False
            True if unclassified breakwaters with a coordinate must be
            plotted, False if unclassified breakwaters must not be
            plotted.
        exclude : list, optional, default: None
            list of breakwater types to exclude from the plot, the
            list is not changed by this method
        """
        # validate exclude input for type
        self._validate_excludes(exclude)

        # create the figure
        plt.figure(figsize=(16, 12))

        # check if an area is specified
        if not area:
            # plot full map
            m = Basemap(
                lat_0=0, lon_0=0, projection='robin', resolution=resolution)
            size = 7
        else:
            # plot the specified area
            m = Basemap(
                llcrnrlon=area[0], llcrnrlat=area[1], urcrnrlon=area[2],
                urcrnrlat=area[3], resolution=resolution)
            size = 15

        # edit lay-out of the map
        m.drawcountries(color='#d7d7d7')
        m.drawmapboundary(fill_color='#D0CFD4', linewidth=0)
        m.fillcontinents(color='#EFEFEF', lake_color='#D0CFD4')

        # remove bw's without a Lon and/or Lat
        filtered_df = self._del_zeros(self.df[['type', 'Lon', 'Lat']])
        filtered_df = filtered_df.dropna(subset=['Lon', 'Lat'])

        # filter df for showing unclassified bw types
        filtered_df = self._show_unclassified(show_unclassified, filtered_df)

        # determine the types to plot
        bw_types, not_excluded = self._partition_types(
            filtered_df['type'].unique(), exclude)

        # iterate over the bw types to plot them
        for bw_type in bw_types:
            # get only the bws of the current type
            bws = filtered_df[filtered_df['type'] == bw_type]

            # plot the bws
            xpt, ypt = m(bws['Lon'].values, bws['Lat'].values)
            m.scatter(
                xpt, ypt, s=size, alpha=1, label=bw_type,
                c=self._colors.get(bw_type, '#17becf'),
                marker=self._markers.get(bw_type, 'H'), zorder=2)

        # check if all types in exclude are in the database
        self._validate_all_excludes(not_excluded)

        # add legend, set tight_layout and show plot
        plt.legend(loc=1)
        plt.tight_layout()
        plt.show()

    def scatter(
            self, param1, param2, show_unclassified=False, exclude=None,
            min_data=5, xmax=None, ymax=None, bins_param1=10,
            bins_param2=10):
        """ Make a scatter plot of two parameters

        Method to generate a scatter plot with histograms for two
        parameters in the database.

        Parameters
        ----------
        param1 : str
            name of parameter 1
        param2 : str
            name of parameter 2
        show_unclassified : bool, optional, default: False
            True if unclassified breakwaters must be plotted, False if
            unclassified breakwaters must not be plotted.
        exclude : list, optional, default: None
            list of breakwater types to exclude from the plot, the
            list is not changed by this method
        min_data : int, optional, default: 5
            minimum number of datapoints required for plotting, if the
            data for a bw type is less than this limit it will be
            skipped
        xmax : float, optional, default: None
            maximum x coordinate of the scatter plot, by default this
            limit is automatically determined
        ymax : float, optional, default: None
            maximum y coordinate of the scatter plot, by default this
            limit is automatically determined
        bins_param1 : int, optional, default: 10
            number of bins for param1
        bins_param2 : int, optional, default: 10
            number of bins for param2
        """
        # validate exclude input for type
        self._validate_excludes(exclude)

        # filter the df to remove zeros and missing values
        filtered_df = self._del_zeros(self.df[['type', param1, param2]])
        filtered_df = filtered_df.dropna(subset=[param1, param2])

        # filter df for showing unclassified bw types
        filtered_df = self._show_unclassified(show_unclassified, filtered_df)

        # create the figure with the scatter plot and the histograms
        fig = plt.figure(figsize=(12, 9))
        gs = GridSpec(4, 4)
        scatter_plot = fig.add_subplot(gs[1:4, 0:3])
        top_hist = fig.add_subplot(gs[0, 0:3])
        right_hist = fig.add_subplot(gs[1:4, 3])

        # create empty lists for the data and colors of the histograms
        hist_param1, hist_param2, colors = [], [], []

        # determine the types to plot
        bw_types, not_excluded = self._partition_types(
            filtered_df['type'].unique(), exclude)

        # iterate over the bw types
        for bw_type in bw_types:
            # get only the bws of the current type
            bws = filtered_df[filtered_df['type'] == bw_type]

            # check if there is enough data
            if len(bws) < min_data:
                # not enough data, show warning
                user_warning(
                    f'{bw_type} was skipped because of a lack of data')
                continue

            # enough data, plot on scatter
            color = self._colors.get(bw_type, '#17becf')
            scatter_plot.scatter(
                bws[param1].values, bws[param2].values, s=18, c=color,
                label=bw_type, marker=self._markers.get(bw_type, 'H'),
                zorder=2)

            # store color and data for the histograms
            colors.append(color)
            hist_param1.append(bws[param1].values)
            hist_param2.append(bws[param2].values)

        # check if all types in exclude are in the database
        self._validate_all_excludes(not_excluded)

        # determine x and y limits, if not specified as arguments
        if xmax is None:
            xmax = np.round(scatter_plot.get_xlim()[1])
        if ymax is None:
            ymax = np.round(scatter_plot.get_ylim()[1])

        # add the histograms to the plot, both histograms use bins
        # that span the limits of the scatter plot
        if hist_param1:
            top_hist.hist(
                hist_param1, bins=self._bin_edges(xmax, bins_param1),
                color=colors)
            right_hist.hist(
                hist_param2, bins=self._bin_edges(ymax, bins_param2),
                orientation='horizontal', color=colors)

        # set x and y lims
        scatter_plot.set_xlim(0, xmax)
        scatter_plot.set_ylim(0, ymax)
        top_hist.set_xlim(0, xmax)
        right_hist.set_ylim(0, ymax)

        # add grid to all plots
        scatter_plot.grid()
        top_hist.grid()
        right_hist.grid()

        # set labels
        scatter_plot.set_xlabel(param1.capitalize())
        scatter_plot.set_ylabel(param2.capitalize())
        top_hist.set_ylabel('Frequency')
        right_hist.set_xlabel('Frequency')

        # remove ticks from histograms
        plt.setp(top_hist.get_xticklabels(), visible=False)
        plt.setp(right_hist.get_yticklabels(), visible=False)

        # add legend, set tight_layout and show plot
        scatter_plot.legend()
        plt.tight_layout()
        plt.show()

    def hist(
            self, param, show_unclassified=False, exclude=None, min_data=5,
            xmax=None, bins=10):
        """ Plot a histogram of a parameter

        Parameters
        ----------
        param : str
            name of the parameter
        show_unclassified : bool, optional, default: False
            True if unclassified breakwaters must be plotted, False if
            unclassified breakwaters must not be plotted.
        exclude : list, optional, default: None
            list of breakwater types to exclude from the plot, the
            list is not changed by this method
        min_data : int, optional, default: 5
            minimum number of datapoints required for plotting, if the
            data for a bw type is less than this limit it will be
            skipped
        xmax : float, optional, default: None
            maximum x coordinate of the histogram, by default this
            limit is automatically determined
        bins : int, optional, default: 10
            number of bins
        """
        # validate exclude input for type
        self._validate_excludes(exclude)

        # remove the zero and missing values from the df
        df = self.df[['type', param]]
        df = df[df[param] != 0].dropna(subset=[param])

        # filter df for showing unclassified bw types
        filtered_df = self._show_unclassified(show_unclassified, df)

        # create lists to store labels, data and colors
        labels, data, colors = [], [], []

        # set variable to track maximum x
        xmax_computed = 0

        # determine the types to plot
        bw_types, not_excluded = self._partition_types(
            filtered_df['type'].unique(), exclude)

        # iterate over the types in the df to get the data
        for bw_type in bw_types:
            # get the values of the current type
            values = filtered_df.loc[
                filtered_df['type'] == bw_type, param].values

            # check if there is enough data
            if len(values) < min_data:
                # not enough data, show warning
                user_warning(
                    f'{bw_type} was skipped because of a lack of data')
                continue

            # enough data, add label, data and used color to the lists
            labels.append(bw_type)
            data.append(values)
            colors.append(self._colors.get(bw_type, '#17becf'))

            # keep track of the maximum value
            xmax_computed = max(xmax_computed, np.max(values))

        # check if all types in exclude are in the database
        self._validate_all_excludes(not_excluded)

        # determine x limit, if not specified as arguments
        if xmax is None:
            # round up so that the maximum value is part of the plot
            xmax = np.ceil(xmax_computed)

        # plot histogram
        if data:
            plt.hist(
                data, bins=self._bin_edges(xmax, bins), label=labels,
                color=colors)

        # format axis
        plt.xlim(0, xmax)
        plt.xlabel(f'{param}')
        plt.ylabel('Frequency')

        # set other lay-out
        plt.title(f'Frequency Histogram of {param}')
        plt.grid()
        plt.legend()
        plt.tight_layout()
        plt.show()
def _validate_plot_vals(param, val, given_val):
""" Validate the values to plot """
if val == 0:
# check if val is given
if given_val != 0:
# return specified value
return given_val
else:
# raise error
raise ValueError(
(f'No value for {param} in the database, use arguments to '
'specify custom value'))
else:
# return value
return val