Check example#

Schema example#

import pandas as pd
from pandera import Check, Column, DataFrameSchema, Index, dtypes
from pandera.engines import pandas_engine
from pandera.typing import Series


# pylint: disable=too-many-lines
@pandas_engine.Engine.register_dtype(
    equivalents=["boolean", pd.BooleanDtype, pd.BooleanDtype()],
)
@dtypes.immutable  # step 2
class LiteralBool(pandas_engine.BOOL):
    """Boolean dtype whose coercion also understands "True"/"False" text."""

    def coerce(  # pylint: disable=arguments-renamed
        self, series: pd.Series
    ) -> pd.Series:
        """
        Cast ``series`` to the pandas nullable ``"boolean"`` dtype.

        String-typed series first get the literals ``"True"`` and
        ``"False"`` replaced by ``1`` and ``0``; any other series is cast
        directly.
        """
        if not pd.api.types.is_string_dtype(series):
            return series.astype("boolean")
        # Map the textual literals to integers before the boolean cast.
        literal_to_flag = {"True": 1, "False": 0}
        return series.replace(literal_to_flag).astype("boolean")


def check_num_finess_format(num_finess_et: Series[str]) -> Series[bool]:
    """
    Flag, element by element, the values shaped like a FINESS identifier.

    A value passes when it consists of exactly nine word characters
    (letters, digits or underscore, as matched by ``\\w``).
    """
    nine_word_characters = "^\\w{9}$"
    string_accessor = num_finess_et.str
    return string_accessor.match(nine_word_characters)


def check_dataframe_coherence(data_df: pd.DataFrame) -> pd.Series:
    """
    Dummy check to test dataframe wide checks, defined at the Schema level

    A row passes when at least one of its cells is not null.

    :param data_df: dataframe under validation
    :return: boolean Series aligned on the rows of ``data_df``; ``True``
        where the row holds at least one non-null value
    """
    # DataFrame.notna() accepts no ``axis`` argument (passing one raises
    # TypeError); the row-wise reduction belongs on ``any``.
    return data_df.notna().any(axis=1)


# Example DataFrameSchema: eleven required columns (strings, categoricals and
# formatted datetimes), one dataframe-wide check and an int64 index.
# NOTE(review): the per-column ``drop_invalid_rows`` comments below describe
# behaviour that reads opposite to the flag value — confirm against the
# pandera documentation before relying on them.
Evaluations = DataFrameSchema(
    columns={
        "num_finess_et": Column(
            dtype="string",
            checks=[Check(check_num_finess_format)],
            drop_invalid_rows=False,  # Counter-intuitive: invalid rows will be dropped # Happened once: GRI-xxxxxx
            nullable=False,
            unique=False,
            coerce=True,
            required=True,
            regex=False,
            description="Geographic FINESS Identifier (ex: 920000650)",
            title="Geographic FINESS Identifier",
        ),
        "eval_code": Column(
            dtype="string",
            checks=Check.str_matches(r"^EVAL\-[\d]{1,6}"),
            drop_invalid_rows=False,  # Counter-intuitive: invalid rows will be dropped # Happened once: GRI-xxxxxx
            nullable=False,
            unique=False,
            coerce=True,
            required=True,
            regex=False,
            description=None,
            title=None,
        ),
        "eval_titre": Column(
            dtype="string",
            checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
            nullable=False,
            unique=False,
            coerce=True,
            required=True,
            regex=False,
            description=None,
            title=None,
        ),
        "eval_statut_code": Column(
            dtype=pd.CategoricalDtype(categories=["Resolved-Completed"]),
            checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
            nullable=False,
            unique=False,
            coerce=True,
            required=True,
            regex=False,
            description=None,
            title=None,
        ),
        "eval_statut_label": Column(
            dtype=pd.CategoricalDtype(categories=["Clôturée"]),
            checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
            nullable=False,
            unique=False,
            coerce=True,
            required=True,
            regex=False,
            description=None,
            title=None,
        ),
        "eval_date_debut": Column(
            dtype=pandas_engine.DateTime(  # type: ignore[call-arg]  # pylint: disable=no-value-for-parameter,unexpected-keyword-arg
                to_datetime_kwargs={"format": "%Y%m%d"}
            ),
            checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
            nullable=False,
            unique=False,
            coerce=True,
            required=True,
            regex=False,
            description=None,
            title=None,
        ),
        "eval_date_fin": Column(
            dtype=pandas_engine.DateTime(  # type: ignore[call-arg] # pylint: disable=no-value-for-parameter,unexpected-keyword-arg
                to_datetime_kwargs={"format": "%Y%m%d"}
            ),
            checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
            nullable=False,
            unique=False,
            coerce=True,
            required=True,
            regex=False,
            description=None,
            title=None,
        ),
        "eval_date_cloture_tech": Column(
            dtype=pandas_engine.DateTime(  # type: ignore[call-arg] # pylint: disable=no-value-for-parameter,unexpected-keyword-arg
                to_datetime_kwargs={"format": "%Y%m%dT%H%M%S.%f GMT"}
            ),
            checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
            nullable=False,
            unique=False,
            coerce=True,
            required=True,
            regex=False,
            description=None,
            title=None,
        ),
        "etablissement": Column(
            dtype="string",
            checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            regex=False,
            description=None,
            title=None,
        ),
        "mission_code": Column(
            dtype="string",
            checks=Check.str_matches(r"^MISSION\-[\d]{2,6}"),
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
            nullable=False,
            unique=False,
            coerce=False,
            required=True,
            regex=False,
            description=None,
            title=None,
        ),
        "oe_code": Column(
            dtype="string",
            checks=None,
            drop_invalid_rows=True,  # Counter-intuitive: invalid rows will raise an error
            nullable=True,
            unique=False,
            coerce=True,
            required=True,
            regex=False,
            description=None,
            title=None,
        ),
    },
    # NOTE(review): the dataframe-wide check is passed as a bare function,
    # whereas the column checks above are wrapped in Check(...) — confirm
    # pandera accepts this form at validation time.
    checks=[check_dataframe_coherence],
    drop_invalid_rows=True,
    index=Index(
        dtype="int64",
        checks=None,
        nullable=False,
        coerce=False,
        name=None,
        description=None,
        title=None,
    ),
    dtype=None,
    coerce=True,
    strict=False,
    name=None,
    ordered=False,
    unique=None,
    report_duplicates="all",
    unique_column_names=False,
    add_missing_columns=False,
    title=None,
    description=None,
)
pandera schema target.check_schema.Evaluations#
Schema Configuration:
  • coerce = True

  • ordered = False

  • strict = False

column num_finess_et: string[python], Geographic FINESS Identifier#

Geographic FINESS Identifier (ex: 920000650)

Constraints:
  • nullable = False

  • unique = False

  • coerce = True

  • required = True

Validated by:
  • check_num_finess_format

column eval_code: string[python]#
Constraints:
  • nullable = False

  • unique = False

  • coerce = True

  • required = True

Validated by:
  • str_matches(‘^EVAL\-[\d]{1,6}’)

column eval_titre: string[python]#
Constraints:
  • nullable = False

  • unique = False

  • coerce = True

  • required = True

column eval_statut_code: category#
Constraints:
  • nullable = False

  • unique = False

  • coerce = True

  • required = True

column eval_statut_label: category#
Constraints:
  • nullable = False

  • unique = False

  • coerce = True

  • required = True

column eval_date_debut: datetime64[ns]#
Constraints:
  • nullable = False

  • unique = False

  • coerce = True

  • required = True

column eval_date_fin: datetime64[ns]#
Constraints:
  • nullable = False

  • unique = False

  • coerce = True

  • required = True

column eval_date_cloture_tech: datetime64[ns]#
Constraints:
  • nullable = False

  • unique = False

  • coerce = True

  • required = True

column etablissement: string[python]#
Constraints:
  • nullable = False

  • unique = False

  • coerce = False

  • required = True

column mission_code: string[python]#
Constraints:
  • nullable = False

  • unique = False

  • coerce = False

  • required = True

Validated by:
  • str_matches(‘^MISSION\-[\d]{2,6}’)

column oe_code: string[python]#
Constraints:
  • nullable = True

  • unique = False

  • coerce = True

  • required = True

check target.check_schema.check_num_finess_format[source]#

Finess identifiers are 9 characters wide (alphanumerical)

Validates:
  • num_finess_et

check target.check_schema.check_dataframe_coherence[source]#

Dummy check to test dataframe wide checks, defined at the Schema level

.. autopandera_schema:: target.check_schema.Evaluations

NB: If you want to use markdown with myst-parser, use the eval-rst directive.

Model example#

import pandas as pd
import pandera as pa
from pandera.engines.pandas_engine import DateTime
from pandera.typing import Series

# Module-level pandera DateTime dtype built with unit="D" and a "%Y-%m-%d"
# format forwarded through to_datetime_kwargs; it is used below as the
# Series annotation of the date_export column.
# pylint: disable-next=unexpected-keyword-arg,no-value-for-parameter
Date: DateTime = DateTime(unit="D", to_datetime_kwargs={"format": "%Y-%m-%d"})  # type: ignore


class TestModel(pa.DataFrameModel):
    """
    Example data model: five typed columns, one column-level check shared
    by both FINESS identifier columns, and one dataframe-level check.
    """

    # pylint: disable=too-few-public-methods,no-self-argument
    class Config:
        strict = True
        coerce = True

    date_export: Series[Date] = pa.Field(  # type: ignore
        title="Export date",
        description=(
            "Date of the export, exports are made available on a yearly basis"
        ),
        coerce=True,
    )
    num_finess_et: Series[str] = pa.Field(
        title="Geographic FINESS Identifier",
        description="Geographic FINESS Identifier (ex: 920000650)",
    )
    num_finess_ej: Series[str] = pa.Field(
        title="Juridic FINESS Identifier",
        description="Identifider of the juridic entity (ex: 920150059)",
    )

    latitude: Series[float] = pa.Field(
        title="Latitude",
        description=(
            "Latitude of the location of the care center"
            "(WGS 84) (ex: 48.84512493935407)"
        ),
        nullable=True,
        le=90,
        ge=-90,
    )
    longitude: Series[float] = pa.Field(
        title="Longitude",
        description=(
            "Longitude of the location of the care center"
            "(WGS 84) (ex: 48.84512493935407)"
        ),
        nullable=True,
        le=180,
        ge=-180,
    )

    @pa.check("num_finess_e.", regex=True, name="check_num_finess_format")
    def check_num_finess_format(
        cls, num_finess_et: Series[str]
    ) -> Series[bool]:
        """
        A FINESS identifier must be exactly nine word characters long.

        Registered with the regex ``num_finess_e.``, so it applies to every
        column whose name matches that pattern.
        """
        nine_word_characters = "^\\w{9}$"
        return num_finess_et.str.match(nine_word_characters)

    @pa.dataframe_check
    def check_coords_non_null(cls, data_df: pd.DataFrame) -> Series[bool]:
        """
        Rows exported in a year strictly greater than 2017 must carry both
        a latitude and a longitude; rows from 2017 or earlier always pass.
        """
        export_year = data_df["date_export"].dt.year
        has_coordinates = (
            data_df["latitude"].notna() & data_df["longitude"].notna()
        )
        recent_with_coordinates = (export_year > 2017) & has_coordinates
        return recent_with_coordinates | (export_year <= 2017)
pandera model target.check_model.TestModel[source]#

Data model with checks

class Config[source]#
strict = True#
coerce = True#
column date_export: Series[DataType(datetime64[ns])], Export date#

Date of the export, exports are made available on a yearly basis

Constraints:
  • nullable = False

  • unique = False

  • coerce = True

  • required = True

column num_finess_et: Series[str], Geographic FINESS Identifier#

Geographic FINESS Identifier (ex: 920000650)

Constraints:
  • nullable = False

  • unique = False

  • coerce = False

  • required = True

Validated by:
  • check_num_finess_format

column num_finess_ej: Series[str], Juridic FINESS Identifier#

Identifider of the juridic entity (ex: 920150059)

Constraints:
  • nullable = False

  • unique = False

  • coerce = False

  • required = True

Validated by:
  • check_num_finess_format

column latitude: Series[float], Latitude#

Latitude of the location of the care center(WGS 84) (ex: 48.84512493935407)

Constraints:
  • nullable = True

  • unique = False

  • coerce = False

  • required = True

Validated by:
  • greater_than_or_equal_to(-90)

  • less_than_or_equal_to(90)

column longitude: Series[float], Longitude#

Longitude of the location of the care center(WGS 84) (ex: 48.84512493935407)

Constraints:
  • nullable = True

  • unique = False

  • coerce = False

  • required = True

Validated by:
  • greater_than_or_equal_to(-180)

  • less_than_or_equal_to(180)

check check_num_finess_format[source]#

Finess identifiers are 9 characters wide (alphanumerical)

Validates:
  • num_finess_et

  • num_finess_ej

check check_coords_non_null[source]#

Longitude and latitude should not be null starting 2017

.. autopandera_model:: target.check_model.TestModel

NB: If you want to use markdown with myst-parser, use the eval-rst directive.