validium validators for pandas dataframes
pip install validframeNeed some faith in those frames? Let's dive in.
Out-of-the-box you get a set of validator factories to handle the considerably more common ways to validate dataframes:
import pandas as pd
import numpy as np
df = pd.DataFrame(
  columns = ['like_counts','comment'], # headers
  data = [
    [42, 'hello world'], # row 0
    [100000, 'π'], # row 1
    [123456, 'lol'], # row 2
    [987, "you're the baz"] # row 3
  ])
validators = [
  vf.frame.not_empty(), # frame must be not empty
  vf.frame.empty(), # frame must be empty
  vf.frame.rows(4), # frame must have 4 rows
  vf.frame.rows(100), # frame must have 100 rows
  vf.frame.cols(2), # frame must have 2 cols
  vf.rows.uniq(), # rows must be unique
  vf.cells.all_is(str, cols=['comment']), # all cells must be instances of <str>
  vf.cells.all_eq(1, cols=['like_counts']), # all cells must equal 1
  vf.cells.all_gt(0, cols=['like_counts']), # all cells must be greater than 0
  vf.cells.all_lt(0, cols=['like_counts']), # all cells must be less than 0
  vf.cells.all_gte(0, cols=['like_counts']), # all cells must be greater than or equal to 0
  vf.cells.all_lte(0, cols=['like_counts']), # all cells must be less than or equal to 0
  vf.cells.some_eq(42, cols=['like_counts']), # some cells must equal 42
  vf.cells.some_is(np.nan, cols=['comment']), # some cells must be instances of <numpy.nan>
  vf.cells.some_gt(100000, cols=['like_counts']), # some cells must be greater than 100000
  vf.cells.some_lt(987, cols=['like_counts']), # some cells must be less than 987
  vf.cells.some_gte(100000, cols=['like_counts']), # some cells must be greater than or equal to 100000
  vf.cells.some_lte(987, cols=['like_counts']), # some cells must be less than or equal to 987
  vf.cells.none_eq(0, cols=['like_counts']), # no cells must equal 0
  vf.cells.none_is(str, cols=['like_counts']), # no cells must be instances of <str>
  vf.cells.none_gt(100000, cols=['like_counts']), # no cells must be greater than 100000
  vf.cells.none_lt(42, cols=['like_counts']), # no cells must be less than 42
  vf.cells.none_gte(100000, cols=['like_counts']), # no cells must be greater than or equal to 100000
  vf.cells.none_lte(42, cols=['like_counts']), # no cells must be less than or equal to 42   
  vf.cells.some_or_none_is(str, cols=['comment']), # some or no cells must be instances of <str>
  vf.cells.some_or_none_eq(0, cols=['like_counts']), # some or no cells must equal 0
  vf.cells.some_or_none_gt(0, cols=['like_counts']), # some or no cells must be greater than 0
  vf.cells.some_or_none_lt(0, cols=['like_counts']), # some or no cells must be less than 0
  vf.cells.some_or_none_gte(0, cols=['like_counts']), # some or no cells must be greater than or equal to 0
  vf.cells.some_or_none_lte(0, cols=['like_counts']), # some or no cells must be less than or equal to 0
  vf.cells.all_or_none_is(str, cols=['comment']), # all or no cells must be instances of <str>
  vf.cells.all_or_none_eq(42, cols=['like_counts']), # all or no cells must equal 42
  vf.cells.all_or_none_gt(100000, cols=['like_counts']), # all or no cells must be greater than 100000
  vf.cells.all_or_none_lt(987, cols=['like_counts']), # all or no cells must be less than 987
  vf.cells.all_or_none_gte(100000, cols=['like_counts']), # all or no cells must be greater than or equal to 100000
  vf.cells.all_or_none_lte(987, cols=['like_counts']), # all or no cells must be less than or equal to 987
  vf.cells.all_or_some_is(str, cols=['comment']), # all or some cells must be instances of <str>
  vf.cells.all_or_some_eq(0, cols=['like_counts']), # all or some cells must equal 0
  vf.cells.all_or_some_gt(100000, cols=['like_counts']), # all or some cells must be greater than 100000
  vf.cells.all_or_some_lt(42, cols=['like_counts']), # all or some cells must be less than 42
  vf.cells.all_or_some_gte(100000, cols=['like_counts']), # all or some cells must be greater than or equal to 100000
  vf.cells.all_or_some_lte(42, cols=['like_counts']), # all or some cells must be less than or equal to 42   
  vf.cells.sum_eq(-1, cols=['like_counts']), # all cells summed must equal -1
  vf.cells.sum_gt(0, cols=['like_counts']), # all cells summed must be greater than 0
  vf.cells.sum_lt(0, cols=['like_counts']), # all cells summed must be less than 0
  vf.cells.sum_gte(0, cols=['like_counts']), # all cells must be greater than or equal to 0
  vf.cells.sum_lte(0, cols=['like_counts']), # all cells must be less than or equal to 0
  vf.cells.uniq(cols=['comments']) # all cells must be unique
]
for v in validators:
  try:
    v.validate(df)
  except AssertionError as err
    print(err)
# AssertionError: frame must be empty
# AssertionError: frame must have 100 rows
# AssertionError: (cols=['like_counts']) all cells must equal 1
# AssertionError: (cols=['like_counts']) all cells must be less than 0
# AssertionError: (cols=['like_counts']) all cells must be less than or equal 0
# AssertionError: (cols=['comment']) some cells must be instances of <numpy.nan>
# AssertionError: (cols=['like_counts']) some cells must be greater than 100000
# AssertionError: (cols=['like_counts']) some cells must be less than 987
# AssertionError: (cols=['like_counts']) no cells must be greater than or equal to 100000
# AssertionError: (cols=['like_counts']) no cells must be less than or equal to 42
# AssertionError: (cols=['comment']) some or no cells must be instances of <str>
# AssertionError: (cols=['like_counts']) some or no cells must be greater than 0
# AssertionError: (cols=['like_counts']) some or no cells must be greater than or equal to 0
# AssertionError: (cols=['like_counts']) all or no cells must equal 42
# AssertionError: (cols=['like_counts']) all or no cells must be greater than or equal to 100000
# AssertionError: (cols=['like_counts']) all or no cells must be less than or equal to 987
# AssertionError: (cols=['like_counts']) all or some cells must equal 0
# AssertionError: (cols=['like_counts']) all or some cells must be greater than 100000
# AssertionError: (cols=['like_counts']) all or some cells must be less than 42
# AssertionError: (cols=['like_counts']) all cells summed must be less than 0 Not quite exhaustive, but enough to cover basic use.
Think there are some other common validators that are missing here? Proposals via issues and PRs are welcomed π
When none of the predefined validators can do the trick, well its time to roll up your sleeves and create your own validator.
For starters you can create a CellsValidator to validate dataframes by their cells:
import validframe as vf
df = pd.DataFrame(
  columns: ['like_counts','comment'], # headers
  data: [
    [42, 'hello world'], # row 0
    [100000, 'π'], # row 1
    [123456, 'lol'], # row 2
    [987, 'earth is definitely flat'] # row 3
  ])
alotta_likes_validator = vf.CellsValidator(
  lambda xs: all([x >= 1000 for x in xs]),
  'all like counts must be atleast 1000'
  cols=['like_counts']
)
alotta_likes_validator.validate(df) # AssertionError: all likes must be atleast 1000You can also create a RowsValidator to validate dataframes by their rows:
df = pd.DataFrame(
  columns: ['date', 'total', 'subtotal', 'tax'], # headers\
  data: [
    ['2020-01-11', 108.25, 100, 8.25], 
    ['2010-01-11', 106, 100, 6], 
    ['2009-01-11', 104.50, 100, 4.50] 
  ])
total_validator = vf.RowsValidator(
  lambda rows: all([row['total'] == row['sub_total'] + row['tax'] for row in rows]),
  'all rows must have total equal the sub-total plus tax',
  cols=['total', 'sub_total', 'tax']
)
total_validator.validate(df) # passIf you really enjoy pandas then you might prefer to create a FrameValidator to validate dataframes utilizing pandas and numpy to write the logic:
import pandas as pd
import numpy as np
ledger_df = pd.DataFrame(
  columns = ['company', 'balance'],
  data = [
    ['Google', 100000], 
    ['Google', -90000], 
    ['Netflix', -10000], # will be unbalanced
    ['Amazon', 0], 
    ['Google', -10000], 
  ]
)
def is_balanced_by_company(df):
  pivot_df = df.pivot_table(values='balance', columns=['company'], aggfunc=np.sum)
  return pivot_df[pivot_df == 0].count().sum() == 0
balanced_validator = vf.FrameValidator(
  is_balanced_by_company,
  'sum of balances for every company must equals 0'
)
balanced_validator.validate(ledger_df) # AssertionError: sum of balances for every company must equals 0As with validium validators in general, using a functional programming library like ramda can add brevity and readability to the code for your validation logic.
import ramda as R
# same as above
all_gt_zero_validator = vf.CellsValidator(
  R.all(lambda x: x>0),
  'all cells must be greater than 0'
  cols=['a']
)This is especially true when your validation logic start to become a bit more complex:
sum_numbers_eq_zero_validator = vf.CellsValidator(
  R.compose(R.equals(0), R.sum, R.filter(lambda x: isinstance(x, Number)),
  'all cells that are numbers summed must be greater than 0'
  cols=['credit', 'debit']
)Another recommendation would be to use a function instead of a lambda when your validation logic can't be expressed comfortably as a onliner, eg. your logic involves making a request to a web API:
import pandas as pd
import request
def match_remote_checksums(df):
  checksums = request.get(REMOTE_CHECKSUM_URL) # just imagine
  remote_df = pd.DataFrame({'checksum': checksums})
  return df.equals(remote_df)
# as a oneliner:
# match_remote_checksums = lambda df: pd.DataFrame({'checksum': request.get(REMOTE_CHECKSUM_URL)}).equals(df)
validator = vf.FrameValidator(
  match_remote_checksums, 
  'checksums must match the set from the server', 
  cols=['checksum']
)