diff --git a/Readme.md b/Readme.md index c89b617..eb4b0f3 100644 --- a/Readme.md +++ b/Readme.md @@ -6,9 +6,9 @@ Find the difference between writing Python code vs compiling C into a shared lib ## Method -A good comparison is storing and searching strings. A trie data structure is implemented with 2 methods - add and find. 2 implementations - Python and C compiled into a dynamic library loaded by the CPython interpreter are tested and benchmarked against each other to find out their memory footprint and speed. +A good comparison is storing and searching strings in a trie. A trie data structure is implemented with 2 methods - add and find. 2 implementations - Python and C compiled into a dynamic library loaded by the CPython interpreter are tested and benchmarked against each other to find out their memory footprint and speed. -While C and the conventions of CPython extension hacking are tricky to get your head around. +While C and the conventions of CPython extension hacking are tricky to get your head around, they are hopefully worth it for applications with high performance requirements. 
## Test @@ -28,7 +28,7 @@ Using a bit of bash magic, we can prepare a lowercase-only, clean of punctuation head -n 50000 /usr/share/dict/words | tail -n 20000 | tr -d "[A-Z|']" | iconv -f utf8 -t ascii//TRANSLIT | uniq | head -n 18000 > clean_words ``` -Take the first 50000 words from unix dictionary fail, take the words from the middle, remove all uppercase letters and apostrophes, convert/transliterate all non-ascii chars to ascii and pipe the top 18000 unique words into the clean_words file +Take the first 50000 words from the unix dictionary file, keep the last 20000 of them, remove all uppercase letters and apostrophes, convert/transliterate all non-ascii chars to ascii and pipe the top 18000 unique words into the clean_words file #### Random order @@ -45,31 +45,32 @@ shuf clean_words > random_words head -n 80000 /usr/share/dict/words | tail -n 1000 | tr -d "[A-Z|']" | iconv -f utf8 -t ascii//TRANSLIT | uniq | head -n 800 > missing_words ``` +### Measure -### Time to add 10000 words +#### Time to add 10000 words Absolute time -#### In alphabetic order +##### In alphabetic order -#### In random order +##### In random order -### Memory footprint +#### Memory footprint The size of trie object after all the words have been added. -### Time to find existing words +#### Time to find existing words Average duration of finding same 50 words that are in the trie -### Time to look for missing words +#### Time to look for missing words Average duration of looking for same 50 words that aren't in the trie ## Results -tbd +Published in a blog and delivered as a presentation. ## Conclusion diff --git a/benchmark.py b/benchmark.py index 33f7a90..591e8d3 100755 --- a/benchmark.py +++ b/benchmark.py @@ -1,44 +1,43 @@ #! 
/usr/bin/env python3 -import subprocess import time from py_trie import PyTrie from ctrie import cTrie +time_fmt = ":5f" + def add_sorted_words(trie_constructor): - print("\n\n{}".format(trie_constructor.__name__)) startTime = time.time() tr = trie_constructor() creationTime = time.time() - startTime - print("{} takes {}s to instantiate".format( - trie_constructor.__name__, creationTime)) + print("Takes {:5f}s to instantiate before adding sorted words".format( + creationTime)) with open("clean_words") as f: for line in f: tr.add(line.strip()) elapsedTime = time.time() - startTime - print("It takes {}s to add 18000 words to a {}".format( - elapsedTime, trie_constructor.__name__)) + print("Takes {:5f}s to add 18000 words".format(elapsedTime)) def add_random_words(trie_constructor): - print("\n\n{}".format(trie_constructor.__name__)) + startTime = time.time() tr = trie_constructor() creationTime = time.time() - startTime - print("{} takes {}s to instantiate".format( - trie_constructor.__name__, creationTime)) + print("Takes {:5f}s to instantiate before adding random words".format( + creationTime)) with open("random_words") as f: for line in f: tr.add(line.strip()) elapsedTime = time.time() - startTime - print("It takes {}s to add 18000 random words to a {}".format( - elapsedTime, trie_constructor.__name__)) + print("Takes {:5f}s to add 18000 random words".format( + elapsedTime)) def find_present_words(trie_constructor): - print("\n\n{}".format(trie_constructor.__name__)) + tr = trie_constructor() with open("clean_words") as f: for line in f: @@ -49,12 +48,11 @@ def find_present_words(trie_constructor): for line in f: tr.find(line.strip()) elapsedTime = time.time() - startTime - print("It takes {}s to find 100 random words in a {}".format( - elapsedTime, trie_constructor.__name__)) + print("Takes {:5f}s to find 100 random words".format( + elapsedTime)) def find_missing_words(trie_constructor): - print("\n\n{}".format(trie_constructor.__name__)) tr = trie_constructor() 
with open("clean_words") as f: @@ -67,12 +65,13 @@ def find_missing_words(trie_constructor): for line in f: tr.find(line.strip()) elapsedTime = time.time() - startTime - print("It takes {}s to look for, but fail to find, 800 missing words in a {}".format( - elapsedTime, trie_constructor.__name__)) + print("Takes {:5f}s to look for, but fail to find, 800 missing words".format( + elapsedTime)) if __name__ == "__main__": - for trie in [cTrie, PyTrie]: + for trie in [PyTrie, cTrie]: + print(trie.__name__) add_sorted_words(trie) add_random_words(trie) find_present_words(trie) diff --git a/makefile b/makefile index 833fefb..185da09 100644 --- a/makefile +++ b/makefile @@ -12,3 +12,6 @@ ext: test: python3 -m pytest test.py + +bench: clean ext test + python3 benchmark.py