Compare commits

...

8 commits

Author SHA1 Message Date
03b9c28554 bump version 2020-03-30 10:48:42 -07:00
0c845a5ad8 remove unused code 2020-03-30 10:43:06 -07:00
2f0b985acf add qmnist dataset 2020-03-30 10:41:09 -07:00
988f481bc0 bump version 2018-07-02 15:56:24 +02:00
0aee9edce6 fix downloading and verification of first files
this code would've worked if the first element of `names`
were the prefix i.e. if it acted on `meta` instead of `names`.
pretty embarrassing!
2018-07-02 15:50:05 +02:00
c83a306cbe bump version 2018-03-24 11:52:33 +01:00
05eeb6cbb5 support python 3.5, fix exceptions 2018-03-24 11:51:56 +01:00
4b564e96ba update todo 2018-03-24 11:27:40 +01:00
7 changed files with 49 additions and 31 deletions

View file

@ -7,7 +7,7 @@ and checked for integrity by SHA-256 hashes.
### dependencies ### dependencies
python 3.6 (or later), numpy. python 3.5 (or later), numpy.
### install ### install
@ -66,7 +66,9 @@ in alphabetical order, using default `mnists.prepare` arguments:
| [emnist][] | emnist\_mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) | | [emnist][] | emnist\_mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) |
| [fashion-mnist][] | fashion\_mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) | | [fashion-mnist][] | fashion\_mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) |
| [mnist][] | mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) | | [mnist][] | mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) |
| [qmnist][] | qmnist | (60000, 1, 28, 28) | (60000, 10) | (60000, 1, 28, 28) | (60000, 10) |
[emnist]: //www.nist.gov/itl/iad/image-group/emnist-dataset [emnist]: //www.nist.gov/itl/iad/image-group/emnist-dataset
[fashion-mnist]: //github.com/zalandoresearch/fashion-mnist [fashion-mnist]: //github.com/zalandoresearch/fashion-mnist
[mnist]: http://yann.lecun.com/exdb/mnist/ [mnist]: http://yann.lecun.com/exdb/mnist/
[qmnist]: //github.com/facebookresearch/qmnist

16
TODO
View file

@ -1,14 +1,22 @@
TODO ## TODO
* finish writing README * finish writing README
* document prepare() function * document prepare() function
* support python 3.5
* adjust dates created/modified on server-hosted files to something sensible * adjust dates created/modified on server-hosted files to something sensible
* basic tests (including PEP 8) * host files on a more reliable service
* add support for mirrors
* basic tests (including running pycodestyle)
* --fix (delete corrupt files) and --debug (logging.DEBUG) __main__ arguments
* try python 3.2 with an old version of numpy (don't care if it doesn't work)
### release version 1.0
* submit to pypi * submit to pypi

View file

@ -3,7 +3,7 @@
# Copyright (C) 2018 Connor Olding # Copyright (C) 2018 Connor Olding
# Distributed under terms of the MIT license. # Distributed under terms of the MIT license.
__version__ = "0.3.0" __version__ = "0.4.0"
import array import array
import gzip import gzip
@ -27,7 +27,7 @@ output_directory = os.path.join(home, ".mnist")
webhost = "https://eaguru.guru/mnist/" webhost = "https://eaguru.guru/mnist/"
def _make_meta(train_part="train", test_part="t10k", prefix=""): def _make_meta(prefix, train_part="train", test_part="t10k"):
images_suffix = "-images-idx3-ubyte.gz" images_suffix = "-images-idx3-ubyte.gz"
labels_suffix = "-labels-idx1-ubyte.gz" labels_suffix = "-labels-idx1-ubyte.gz"
return (prefix, return (prefix,
@ -37,19 +37,21 @@ def _make_meta(train_part="train", test_part="t10k", prefix=""):
test_part + labels_suffix) test_part + labels_suffix)
def _emnist_meta(name): def _make_meta2(name):
return _make_meta(name + "-train", name + "-test", prefix="emnist") prefix, _, _ = name.partition("-")
return _make_meta(prefix, name + "-train", name + "-test")
metadata = dict( metadata = dict(
emnist_balanced=_emnist_meta("emnist-balanced"), emnist_balanced=_make_meta2("emnist-balanced"),
emnist_byclass=_emnist_meta("emnist-byclass"), emnist_byclass=_make_meta2("emnist-byclass"),
emnist_bymerge=_emnist_meta("emnist-bymerge"), emnist_bymerge=_make_meta2("emnist-bymerge"),
emnist_digits=_emnist_meta("emnist-digits"), emnist_digits=_make_meta2("emnist-digits"),
emnist_letters=_emnist_meta("emnist-letters"), emnist_letters=_make_meta2("emnist-letters"),
emnist_mnist=_emnist_meta("emnist-mnist"), emnist_mnist=_make_meta2("emnist-mnist"),
fashion_mnist=_make_meta(prefix="fashion-mnist"), fashion_mnist=_make_meta("fashion-mnist"),
mnist=_make_meta(prefix="mnist"), mnist=_make_meta("mnist"),
qmnist=_make_meta2("qmnist"),
) )
@ -90,14 +92,15 @@ def validate(name):
if name not in hashes.keys(): if name not in hashes.keys():
raise UnknownDatasetError(name) raise UnknownDatasetError(name)
with open(construct_path(name), "rb") as f: path = construct_path(name)
with open(path, "rb") as f:
data = f.read() data = f.read()
known_hash = hashes[name] known_hash = hashes[name]
hash = hashlib.sha256(data).hexdigest() hash = hashlib.sha256(data).hexdigest()
if hash != known_hash: if hash != known_hash:
raise IntegrityError(file, known_hash, hash) raise IntegrityError(path, known_hash, hash)
def onehot(ind): def onehot(ind):
@ -159,17 +162,13 @@ def prepare(dataset="mnist", return_floats=True, return_onehot=True,
prefix, names = meta[0], meta[1:] prefix, names = meta[0], meta[1:]
names = [os.path.join(prefix, name) for name in names] names = [os.path.join(prefix, name) for name in names]
train_images, train_labels, test_images, test_labels = names train_images, train_labels, test_images, test_labels = names
images_and_labels = names[1:]
logger.debug("Filenames chosen for %s: %s, %s, %s, %s", logger.debug("Filenames chosen for %s: %s, %s, %s, %s",
dataset, train_images, train_labels, test_images, test_labels) dataset, train_images, train_labels, test_images, test_labels)
make_directories() make_directories()
existing = [os.path.isfile(construct_path(name)) for name in names] for name in names:
gz_existing = existing[0], existing[1:]
for name in images_and_labels:
download(name) download(name)
if check_integrity: if check_integrity:
validate(name) validate(name)

View file

@ -21,6 +21,7 @@ urls = {
"emnist": "//www.nist.gov/itl/iad/image-group/emnist-dataset", "emnist": "//www.nist.gov/itl/iad/image-group/emnist-dataset",
"fashion-mnist": "//github.com/zalandoresearch/fashion-mnist", "fashion-mnist": "//github.com/zalandoresearch/fashion-mnist",
"mnist": "http://yann.lecun.com/exdb/mnist/", "mnist": "http://yann.lecun.com/exdb/mnist/",
"qmnist": "//github.com/facebookresearch/qmnist",
} }
print(row.format(*headers)) print(row.format(*headers))
@ -31,7 +32,7 @@ for name in metadata.keys():
# print out the shape table for use in the README. # print out the shape table for use in the README.
data = prepare(name) data = prepare(name)
prefix = metadata[name][0] prefix = metadata[name][0]
row_data = [f"[{prefix}][]"] row_data = ["[{}][]".format(prefix)]
row_data += [name.replace("_", "\\_")] row_data += [name.replace("_", "\\_")]
row_data += [str(array.shape) for array in data] row_data += [str(array.shape) for array in data]
print(row.format(*row_data)) print(row.format(*row_data))
@ -39,4 +40,4 @@ for name in metadata.keys():
print() print()
for anchor, url in urls.items(): for anchor, url in urls.items():
print(f"[{anchor}]: {url}") print("[{}]: {}".format(anchor, url))

View file

@ -5,9 +5,9 @@ class IntegrityError(Exception):
self.computed_hash = computed_hash self.computed_hash = computed_hash
def __str__(self): def __str__(self):
return f"""Failed to validate dataset: {name} return """Failed to validate dataset: {self.file}
Hash mismatch: {self.computed_hash} should be {self.expected_hash} Hash mismatch: {self.computed_hash} should be {self.expected_hash}
Please check your local file for tampering or corruption.""" Please check your local file for tampering or corruption.""".format(self=self)
class UnknownDatasetError(Exception): class UnknownDatasetError(Exception):
@ -15,4 +15,4 @@ class UnknownDatasetError(Exception):
self.dataset = dataset self.dataset = dataset
def __str__(self): def __str__(self):
return f"Unknown mnist-like dataset: {dataset}" return "Unknown mnist-like dataset: {self.dataset}".format(self=self)

View file

@ -63,4 +63,12 @@ hashes = {
'440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609', '440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609',
'mnist/train-labels-idx1-ubyte.gz': 'mnist/train-labels-idx1-ubyte.gz':
'3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c', '3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c',
'qmnist/qmnist-test-images-idx3-ubyte.gz':
'43fc22bf7498b8fc98de98369d72f752d0deabc280a43a7bcc364ab19e57b375',
'qmnist/qmnist-test-labels-idx1-ubyte.gz':
'3f384004f51536c2a29f2ce4d36388ee6cb8fff45bc3ad0cc588a86f2cc76375',
'qmnist/qmnist-train-images-idx3-ubyte.gz':
'9e26a7bf1683614e065d7b76460ccd52807165b3f22561fb782bd9f38c52b51d',
'qmnist/qmnist-train-labels-idx1-ubyte.gz':
'3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c',
} }

View file

@ -2,7 +2,7 @@ from setuptools import setup
setup( setup(
name='mnists', name='mnists',
version='0.3.0', version='0.4.0',
packages=[ packages=[
'mnists', 'mnists',
], ],
@ -22,7 +22,7 @@ setup(
'Natural Language :: English', 'Natural Language :: English',
'Programming Language :: Python', 'Programming Language :: Python',
'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.5',
'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering',
] ]
) )