Check example
Schema example
import pandas as pd
from pandera import Check, Column, DataFrameSchema, Index, dtypes
from pandera.engines import pandas_engine
from pandera.typing import Series
# pylint: disable=too-many-lines
@pandas_engine.Engine.register_dtype(
equivalents=["boolean", pd.BooleanDtype, pd.BooleanDtype()],
)
@dtypes.immutable  # pandera data types must be immutable
class LiteralBool(pandas_engine.BOOL):
def coerce( # pylint: disable=arguments-renamed
self, series: pd.Series
) -> pd.Series:
"""Coerce a pandas.Series to boolean types."""
if pd.api.types.is_string_dtype(series):
series = series.replace({"True": 1, "False": 0})
return series.astype("boolean")
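# Illustration only (not part of the original example): the custom dtype coerces
# string flags to pandas' nullable boolean type, e.g.
#   LiteralBool().coerce(pd.Series(["True", "False"]))  # -> dtype "boolean"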
def check_num_finess_format(num_finess_et: Series[str]) -> Series[bool]:
"""
    FINESS identifiers are 9 alphanumeric characters long
"""
return num_finess_et.str.match("^\\w{9}$")
def check_dataframe_coherence(data_df):
"""
Dummy check to test dataframe wide checks, defined at the Schema level
"""
    return data_df.notna().any(axis=1)
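# Note: a dataframe-wide check that returns one boolean per row lets pandera
# report (or drop) exactly the rows for which it evaluates to False.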
Evaluations = DataFrameSchema(
columns={
"num_finess_et": Column(
dtype="string",
checks=[Check(check_num_finess_format)],
            drop_invalid_rows=False,  # Counter-intuitive: invalid rows will be dropped  # Happened once: GRI-xxxxxx
nullable=False,
unique=False,
coerce=True,
required=True,
regex=False,
description="Geographic FINESS Identifier (ex: 920000650)",
title="Geographic FINESS Identifier",
),
"eval_code": Column(
dtype="string",
checks=Check.str_matches(r"^EVAL\-[\d]{1,6}"),
            drop_invalid_rows=False,  # Counter-intuitive: invalid rows will be dropped  # Happened once: GRI-xxxxxx
nullable=False,
unique=False,
coerce=True,
required=True,
regex=False,
description=None,
title=None,
),
"eval_titre": Column(
dtype="string",
checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
nullable=False,
unique=False,
coerce=True,
required=True,
regex=False,
description=None,
title=None,
),
"eval_statut_code": Column(
dtype=pd.CategoricalDtype(categories=["Resolved-Completed"]),
checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
nullable=False,
unique=False,
coerce=True,
required=True,
regex=False,
description=None,
title=None,
),
"eval_statut_label": Column(
dtype=pd.CategoricalDtype(categories=["Clôturée"]),
checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
nullable=False,
unique=False,
coerce=True,
required=True,
regex=False,
description=None,
title=None,
),
"eval_date_debut": Column(
dtype=pandas_engine.DateTime( # type: ignore[call-arg] # pylint: disable=no-value-for-parameter,unexpected-keyword-arg
to_datetime_kwargs={"format": "%Y%m%d"}
),
checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
nullable=False,
unique=False,
coerce=True,
required=True,
regex=False,
description=None,
title=None,
),
"eval_date_fin": Column(
dtype=pandas_engine.DateTime( # type: ignore[call-arg] # pylint: disable=no-value-for-parameter,unexpected-keyword-arg
to_datetime_kwargs={"format": "%Y%m%d"}
),
checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
nullable=False,
unique=False,
coerce=True,
required=True,
regex=False,
description=None,
title=None,
),
"eval_date_cloture_tech": Column(
dtype=pandas_engine.DateTime( # type: ignore[call-arg] # pylint: disable=no-value-for-parameter,unexpected-keyword-arg
to_datetime_kwargs={"format": "%Y%m%dT%H%M%S.%f GMT"}
),
checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
nullable=False,
unique=False,
coerce=True,
required=True,
regex=False,
description=None,
title=None,
),
"etablissement": Column(
dtype="string",
checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
nullable=False,
unique=False,
coerce=False,
required=True,
regex=False,
description=None,
title=None,
),
"mission_code": Column(
dtype="string",
checks=Check.str_matches(r"^MISSION\-[\d]{2,6}"),
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
nullable=False,
unique=False,
coerce=False,
required=True,
regex=False,
description=None,
title=None,
),
"oe_code": Column(
dtype="string",
checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
nullable=True,
unique=False,
coerce=True,
required=True,
regex=False,
description=None,
title=None,
),
},
    checks=[Check(check_dataframe_coherence)],
drop_invalid_rows=True,
index=Index(
dtype="int64",
checks=None,
nullable=False,
coerce=False,
name=None,
description=None,
title=None,
),
dtype=None,
coerce=True,
strict=False,
name=None,
ordered=False,
unique=None,
report_duplicates="all",
unique_column_names=False,
add_missing_columns=False,
title=None,
description=None,
)
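For reference, a minimal usage sketch (raw_df is a hypothetical DataFrame with the columns above, not defined in this example): since the schema sets drop_invalid_rows=True, validation has to be run lazily, and rows that fail checks are removed from the returned frame.
# hedged sketch: raw_df is assumed, not part of the example file
validated_df = Evaluations.validate(raw_df, lazy=True)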
- pandera schema target.check_schema.Evaluations
- Schema Configuration:
coerce = True
ordered = False
strict = False
- column num_finess_et: string[python], Geographic FINESS Identifier
Geographic FINESS Identifier (ex: 920000650)
- Constraints:
nullable = False
unique = False
coerce = True
required = True
- Validated by:
- column eval_code: string[python]
- Constraints:
nullable = False
unique = False
coerce = True
required = True
- Validated by:
str_matches('^EVAL\-[\d]{1,6}')
- column eval_titre: string[python]
- Constraints:
nullable = False
unique = False
coerce = True
required = True
- column eval_statut_code: category
- Constraints:
nullable = False
unique = False
coerce = True
required = True
- column eval_statut_label: category
- Constraints:
nullable = False
unique = False
coerce = True
required = True
- column eval_date_debut: datetime64[ns]
- Constraints:
nullable = False
unique = False
coerce = True
required = True
- column eval_date_fin: datetime64[ns]
- Constraints:
nullable = False
unique = False
coerce = True
required = True
- column eval_date_cloture_tech: datetime64[ns]
- Constraints:
nullable = False
unique = False
coerce = True
required = True
- column etablissement: string[python]
- Constraints:
nullable = False
unique = False
coerce = False
required = True
- column mission_code: string[python]
- Constraints:
nullable = False
unique = False
coerce = False
required = True
- Validated by:
str_matches('^MISSION\-[\d]{2,6}')
- column oe_code: string[python]
- Constraints:
nullable = True
unique = False
coerce = True
required = True
.. autopandera_schema:: target.check_schema.Evaluations
NB: If you want to use markdown with myst-parser, use the eval-rst directive.
Model example
import pandas as pd
import pandera as pa
from pandera.engines.pandas_engine import DateTime
from pandera.typing import Series
# pylint: disable-next=unexpected-keyword-arg,no-value-for-parameter
Date: DateTime = DateTime(unit="D", to_datetime_kwargs={"format": "%Y-%m-%d"}) # type: ignore
class TestModel(pa.DataFrameModel):
"""
Data model with checks
"""
# pylint: disable=too-few-public-methods,no-self-argument
class Config:
strict = True
coerce = True
date_export: Series[Date] = pa.Field( # type: ignore
title="Export date",
description=(
"Date of the export, exports are made available on a yearly basis"
),
coerce=True,
)
num_finess_et: Series[str] = pa.Field(
title="Geographic FINESS Identifier",
description="Geographic FINESS Identifier (ex: 920000650)",
)
num_finess_ej: Series[str] = pa.Field(
title="Juridic FINESS Identifier",
description="Identifider of the juridic entity (ex: 920150059)",
)
latitude: Series[float] = pa.Field(
title="Latitude",
description=(
"Latitude of the location of the care center"
"(WGS 84) (ex: 48.84512493935407)"
),
nullable=True,
le=90,
ge=-90,
)
longitude: Series[float] = pa.Field(
title="Longitude",
description=(
"Longitude of the location of the care center"
"(WGS 84) (ex: 48.84512493935407)"
),
nullable=True,
le=180,
ge=-180,
)
@pa.check("num_finess_e.", regex=True, name="check_num_finess_format")
def check_num_finess_format(
cls, num_finess_et: Series[str]
) -> Series[bool]:
"""
        FINESS identifiers are 9 alphanumeric characters long
"""
return num_finess_et.str.match("^\\w{9}$")
@pa.dataframe_check
def check_coords_non_null(cls, data_df: pd.DataFrame) -> Series[bool]:
"""
        Longitude and latitude must not be null for exports after 2017
"""
return (
(data_df["date_export"].dt.year > 2017)
& data_df["latitude"].notna()
& data_df["longitude"].notna()
) | (data_df["date_export"].dt.year <= 2017)
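As a usage sketch (df is a hypothetical DataFrame, not part of the example), the model can be validated directly or used as a run-time-checked type annotation:
from pandera.typing import DataFrame
# hedged sketch: df is assumed, not part of the example file
validated = TestModel.validate(df)  # raises pandera.errors.SchemaError on failure
@pa.check_types
def load_exports(data: DataFrame[TestModel]) -> DataFrame[TestModel]:
    return data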
- pandera model target.check_model.TestModel
Data model with checks
- column date_export: Series[DataType(datetime64[ns])], Export date
Date of the export, exports are made available on a yearly basis
- Constraints:
nullable = False
unique = False
coerce = True
required = True
- column num_finess_et: Series[str], Geographic FINESS Identifier
Geographic FINESS Identifier (ex: 920000650)
- Constraints:
nullable = False
unique = False
coerce = False
required = True
- Validated by:
- column num_finess_ej: Series[str], Juridic FINESS Identifier
Identifier of the juridic entity (ex: 920150059)
- Constraints:
nullable = False
unique = False
coerce = False
required = True
- Validated by:
- column latitude: Series[float], Latitude
Latitude of the location of the care center (WGS 84) (ex: 48.84512493935407)
- Constraints:
nullable = True
unique = False
coerce = False
required = True
- Validated by:
greater_than_or_equal_to(-90)
less_than_or_equal_to(90)
- column longitude: Series[float], Longitude
Longitude of the location of the care center (WGS 84) (ex: 48.84512493935407)
- Constraints:
nullable = True
unique = False
coerce = False
required = True
- Validated by:
greater_than_or_equal_to(-180)
less_than_or_equal_to(180)
.. autopandera_model:: target.check_model.TestModel
NB: If you want to use markdown with myst-parser, use the eval-rst directive.