Don't write tests, Generate them!

Puneeth Chaganti

Introduction

Property-based testing, anyone?

A typical test-suite

def test_strip_whitespace_with_no_argument():
    assert strip('  foo ') == 'foo'

def test_should_strip_whitespace_with_argument():
    assert strip('  foo ', ' ') == 'foo'

def test_should_strip_non_whitespace():
    assert strip('foo', 'fo') == ''

...

Example based tests

Given: Setup some example data
When: Perform actions
Then: assert output == expected

Problems?

Combinatorial explosion
Biases carry-over to tests
Tedious

State in a website

website-states

Enter Generative testing

(Property-based testing)

Property-based test, the hard way

def test_strip_random():
    for _ in range(200):
        s = random_string()
        strip_chars = random_string()
        S = strip(s, strip_chars)
        assert is_stripped(S, s, strip_chars)

def is_stripped(S, s, strip_chars):
    assert len(S) <= len(s)
    if len(S) > 0:
        assert S[0] not in set(strip_chars)
        assert S[-1] not in set(strip_chars)
    return True

random_string = [
    random.choice(string.ascii_letters)
    for _ in range(10)
]

Property based tests

Given: For random data matching a spec
When: Perform actions
Then: assert property(output)

Hypothesis - Property based testing for Python

Hypothesized test

from hypothesis import given, strategies as st

@given(st.text(), st.text())
def test_strip_hypothesis(s, strip_chars):
    S = strip(s, strip_chars)
    assert is_stripped(S, s, strip_chars)
# Ran 1 test in 0.159s

Failing output

strip = lambda x, y: x.lstrip(y)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

output = '01', input_ = '01', strip_chars = '1'

    def is_stripped(output, input_, strip_chars):
        assert len(output) <= len(input_)
        if len(output) > 0:
            assert output[0] not in set(strip_chars)
>           assert output[-1] not in set(strip_chars)
E           assert '1' not in {'1'}
E            +  where {'1'} = set('1')

test_code.py:113: AssertionError
----------------------------------------------------- Hypothesis ------------------------------------------------------
Falsifying example: test_strip(s='01', strip_chars='1')

Shrinking

Random data has lots of noise
Try to find the "simplest" failing case

To learn more, see Designing a better simplifier

Data generation

Generators for built-ins

from hypothesis import strategies as st

def sample(strategy, n=3):
    return [strategy.example() for _ in range(n)]

print(sample(st.integers()))
print(sample(st.floats()))
print(sample(st.complex_numbers()))
print(sample(st.text(max_size=3)))
print(sample(st.lists(st.integers())))

[-7435755662106, -49, -1295624]
[-9.266256382731017e+17, -0.19780830243100944, -2.4010523231296193e+61]
[(-0.99999-0.99999j), (-2.220446049250313e-16+nanj), (0.003554608069336136-1.923176004582495e-275j)]
['', '\U000ded7f9', '']
[[52647858669059, -31758544979, 71365626], [0], []]

Extra generators

Django models
Numpy arrays
Dates & times
Faker generators

Composable strategies

from hypothesis import strategies as st

st.recursive?
st.one_of?
st.builds?
st.streaming?

.map, .filter, .flatmap

Composing strategies - Example

rows = [('John', 'Adams', 90), (...), (...)]
headers = ['first_name', 'last_name', 'gpa']
print(tablib.Dataset(*rows, headers=headers))

first_name|last_name |gpa
----------|----------|---
John      |Adams     |90
George    |Washington|67
Thomas    |Jefferson |50

Generate Rows & Header

from hypothesis import strategies as st; import string

n = 3
alphabet = string.ascii_letters
generate_row = st.tuples(
    st.text(alphabet, min_size=1),
    st.text(alphabet, min_size=1),
    st.integers(min_value=0, max_value=100)
)
generate_table = st.lists(generate_row, min_size=3, max_size=3)
generate_headers = st.lists(
    st.text(alphabet, min_size=1),
    unique=True,
    min_size=n,
    max_size=n
)

Putting it together

def create_dataset(rows, headers):
    return tablib.Dataset(*rows, headers=headers)

def generate_dataset():
    return st.builds(create_dataset, generate_data, headers=generate_headers)

print(generate_dataset().example())

znefubbdv     |wpclcf|ouc
--------------|------|---
aecpjxzwfqosmu|krlmfh|55
htq           |jid   |87
lwbfboxyifre  |oqdha |83

Simple `tablib` test

def test_add_column():
    rows = [['kenneth'], ['bessie']]
    data = tablib.Dataset(*rows, headers=['fname'])
    new_col = ['reitz', 'monke']
    data.append_col(new_col, header='lname')

    assert data[0] == ('kenneth', 'reitz'))
    assert data.width == 2

to a property based test

@given(data=generate_dataset(),
       new_col=st.lists(st.text(min_size=1), min_size=3, max_size=3),
       header=st.text(min_size=3))
def test_hyp_add_column(data, new_col, header):
    first_row = data[0]
    data.append_col(new_col, header=header)

    assert data[0] == first_row + (new_col[0],)
    assert data.width == 4

Test transpose

@given(generate_dataset())
def test_transpose(self, data):
    data_ = data.transpose()

    self.assertEqual(data.width, data_.height+1)
    self.assertEqual(data.height, data_.width-1)

Round trip transpose

@given(generate_dataset())
def test_two_transposes(self, data):
    data_ = data.transpose().transpose()

    self.assertEqual(data.width, data_.width)
    self.assertEqual(data.height, data_.height)

    self.assertEqual(data.width, data_.height)
E   AssertionError: 3 != 2
------------------------------------------------ Captured stdout call -------------------------------------------------
Falsifying example:
a|b|c
-|-|-
a|a|0
a|a|0
a|a|0

Round trip to json

@given(generate_dataset())
def test_json_export_import_works(data):
    json_ = data.json
    data_ = tablib.import_set(json_)

    self.assertEqual(data.width, data_.width)
    self.assertEqual(data.height, data_.height)
    self.assertEqual(data[0], data_[0]))

    self.assertEqual(data[0], data_[0])
E   AssertionError: Tuples differ: ('a', 'a', 0) != ('a', 0, 'a')

Verification

strip tests from before

Sorting actually returns a sorted list

Computing the mean

from hypothesis import given, strategies as st

@given(st.lists(st.floats(allow_nan=False, allow_infinity=False)), min_size=1)
def test_mean_is_within_reasonable_bounds(ls):
    assert min(ls) <= mean(ls) <= max(ls)

Going by definition …

def mean(xs):
    return sum(xs) / len(xs)

ls = [8.988465674311579e+307, 8.98846567431158e+307]

    @given(st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=1))
    def test_mean_is_within_reasonable_bounds(ls):
>       assert min(ls) <= mean(ls) <= max(ls)
E       assert inf <= 8.98846567431158e+307
E        +  where inf = mean([8.988465674311579e+307, 8.98846567431158e+307])
E        +  and   8.98846567431158e+307 = max([8.988465674311579e+307, 8.98846567431158e+307])

Avoiding overflow

def mean(xs):
    n = len(xs)
    return sum(x / n  for x in xs)

ls = [1.390671161567e-309, 1.390671161567e-309, 1.390671161567e-309]

    @given(st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=1))
    def test_mean_is_within_reasonable_bounds(ls):
>       assert min(ls) <= mean(ls) <= max(ls)
E       assert 1.390671161567e-309 <= 1.390671161566996e-309
E        +  where 1.390671161567e-309 = min([1.390671161567e-309, 1.390671161567e-309, 1.390671161567e-309])
E        +  and   1.390671161566996e-309 = mean([1.390671161567e-309, 1.390671161567e-309, 1.390671161567e-309])

For instance, `numpy`

import numpy as np
def mean(xs):
    return np.array(xs).mean()

ls = [8.988465674311579e+307, 8.98846567431158e+307]

    @given(st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=1))
    def test_mean_is_within_reasonable_bounds(ls):
>       assert min(ls) <= mean(ls) <= max(ls)
E       assert inf <= 8.98846567431158e+307
E        +  where inf = mean([8.988465674311579e+307, 8.98846567431158e+307])
E        +  and   8.98846567431158e+307 = max([8.988465674311579e+307, 8.98846567431158e+307])

Read this 30 page paper, to see how to do it right!

Test Oracle

from hypothesis import strategies as st, given
from my_lib import my_sort

@given(st.lists(st.integers()))
def test_my_sort(xs):
    assert sorted(xs) == my_sort(xs)

More patterns

See talk by Jeremy Thurgood

Induction
Transformation
Invariance
Idempotence

Keep in mind

Fast data generation
Fast assertions
Simple looking, yet powerful
Re-use?

Stateful testing

def test_website():
    assert login(credentials)
    assert go_to_homepage()
    assert follow_friend()
    assert logout()

Pseudocode example

class WebSiteStateMachine(RuleBasedStateMachine):
    def __init__(self):
        super(WebSiteStateMachine, self).__init__()

    def login(self):
        """Login using credentials and assert success."""

    @rule()
    def logout(self):
        """Logout and assert it worksn."""

    @rule(user=st.sampled_from(USERS))
    def follow_user(self, user):
        """Assert that following a user works."""

WebSiteTestCase = WebSiteStateMachine.TestCase

Problems with Generative Testing?

Performance
Debugging CI failures
Rare branches?

Conclusion

Property based tests

Concise
Overcome developer biases
Assert general things

Hypothesis

Generate data, given a requirement
Check that a property holds true
Shrink failed cases to simplest case

Some interesting case studies

John Hughes: Testing the hard stuff and staying sane
Ashton Kemerling: Generative Integration Testing
Sean Grove: Generating and Running 1M tests

Pairing anyone?

Thank you

@punchagan

CC-BY-SA 4.0

http://tinyurl.com/pygentest

Don't write tests, Generate them!

Puneeth Chaganti

Introduction

Property-based testing, anyone?

A typical test-suite

Example based tests

Problems?

State in a website

Enter Generative testing

Property-based test, the hard way

Property based tests

Hypothesis - Property based testing for Python

Hypothesized test

Failing output

Shrinking

Data generation

Generators for built-ins

Extra generators

Composable strategies

Composing strategies - Example

Generate Rows & Header

Putting it together

Simple tablib test

to a property based test

Test transpose

Round trip transpose

Round trip to json

Verification

Computing the mean

Going by definition …

Avoiding overflow

For instance, numpy

Test Oracle

More patterns

Keep in mind

Stateful testing

Pseudocode example

Problems with Generative Testing?

Conclusion

Property based tests

Hypothesis

Some interesting case studies

Pairing anyone?

Thank you

Simple `tablib` test

For instance, `numpy`