Compare commits
8 commits
Author | SHA1 | Date | |
---|---|---|---|
03b9c28554 | |||
0c845a5ad8 | |||
2f0b985acf | |||
988f481bc0 | |||
0aee9edce6 | |||
c83a306cbe | |||
05eeb6cbb5 | |||
4b564e96ba |
7 changed files with 49 additions and 31 deletions
|
@ -7,7 +7,7 @@ and checked for integrity by SHA-256 hashes.
|
||||||
|
|
||||||
### dependencies
|
### dependencies
|
||||||
|
|
||||||
python 3.6 (or later), numpy.
|
python 3.5 (or later), numpy.
|
||||||
|
|
||||||
### install
|
### install
|
||||||
|
|
||||||
|
@ -66,7 +66,9 @@ in alphabetical order, using default `mnists.prepare` arguments:
|
||||||
| [emnist][] | emnist\_mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) |
|
| [emnist][] | emnist\_mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) |
|
||||||
| [fashion-mnist][] | fashion\_mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) |
|
| [fashion-mnist][] | fashion\_mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) |
|
||||||
| [mnist][] | mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) |
|
| [mnist][] | mnist | (60000, 1, 28, 28) | (60000, 10) | (10000, 1, 28, 28) | (10000, 10) |
|
||||||
|
| [qmnist][] | qmnist | (60000, 1, 28, 28) | (60000, 10) | (60000, 1, 28, 28) | (60000, 10) |
|
||||||
|
|
||||||
[emnist]: //www.nist.gov/itl/iad/image-group/emnist-dataset
|
[emnist]: //www.nist.gov/itl/iad/image-group/emnist-dataset
|
||||||
[fashion-mnist]: //github.com/zalandoresearch/fashion-mnist
|
[fashion-mnist]: //github.com/zalandoresearch/fashion-mnist
|
||||||
[mnist]: http://yann.lecun.com/exdb/mnist/
|
[mnist]: http://yann.lecun.com/exdb/mnist/
|
||||||
|
[qmnist]: //github.com/facebookresearch/qmnist
|
||||||
|
|
16
TODO
16
TODO
|
@ -1,14 +1,22 @@
|
||||||
TODO
|
## TODO
|
||||||
|
|
||||||
* finish writing README
|
* finish writing README
|
||||||
|
|
||||||
* document prepare() function
|
* document prepare() function
|
||||||
|
|
||||||
* support python 3.5
|
|
||||||
|
|
||||||
* adjust dates created/modified on server-hosted files to something sensible
|
* adjust dates created/modified on server-hosted files to something sensible
|
||||||
|
|
||||||
* basic tests (including PEP 8)
|
* host files on a more reliable service
|
||||||
|
|
||||||
|
* add support for mirrors
|
||||||
|
|
||||||
|
* basic tests (including running pycodestyle)
|
||||||
|
|
||||||
|
* --fix (delete corrupt files) and --debug (logging.DEBUG) __main__ arguments
|
||||||
|
|
||||||
|
* try python 3.2 with an old version of numpy (don't care if it doesn't work)
|
||||||
|
|
||||||
|
### release version 1.0
|
||||||
|
|
||||||
* submit to pypi
|
* submit to pypi
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
# Copyright (C) 2018 Connor Olding
|
# Copyright (C) 2018 Connor Olding
|
||||||
# Distributed under terms of the MIT license.
|
# Distributed under terms of the MIT license.
|
||||||
|
|
||||||
__version__ = "0.3.0"
|
__version__ = "0.4.0"
|
||||||
|
|
||||||
import array
|
import array
|
||||||
import gzip
|
import gzip
|
||||||
|
@ -27,7 +27,7 @@ output_directory = os.path.join(home, ".mnist")
|
||||||
webhost = "https://eaguru.guru/mnist/"
|
webhost = "https://eaguru.guru/mnist/"
|
||||||
|
|
||||||
|
|
||||||
def _make_meta(train_part="train", test_part="t10k", prefix=""):
|
def _make_meta(prefix, train_part="train", test_part="t10k"):
|
||||||
images_suffix = "-images-idx3-ubyte.gz"
|
images_suffix = "-images-idx3-ubyte.gz"
|
||||||
labels_suffix = "-labels-idx1-ubyte.gz"
|
labels_suffix = "-labels-idx1-ubyte.gz"
|
||||||
return (prefix,
|
return (prefix,
|
||||||
|
@ -37,19 +37,21 @@ def _make_meta(train_part="train", test_part="t10k", prefix=""):
|
||||||
test_part + labels_suffix)
|
test_part + labels_suffix)
|
||||||
|
|
||||||
|
|
||||||
def _emnist_meta(name):
|
def _make_meta2(name):
|
||||||
return _make_meta(name + "-train", name + "-test", prefix="emnist")
|
prefix, _, _ = name.partition("-")
|
||||||
|
return _make_meta(prefix, name + "-train", name + "-test")
|
||||||
|
|
||||||
|
|
||||||
metadata = dict(
|
metadata = dict(
|
||||||
emnist_balanced=_emnist_meta("emnist-balanced"),
|
emnist_balanced=_make_meta2("emnist-balanced"),
|
||||||
emnist_byclass=_emnist_meta("emnist-byclass"),
|
emnist_byclass=_make_meta2("emnist-byclass"),
|
||||||
emnist_bymerge=_emnist_meta("emnist-bymerge"),
|
emnist_bymerge=_make_meta2("emnist-bymerge"),
|
||||||
emnist_digits=_emnist_meta("emnist-digits"),
|
emnist_digits=_make_meta2("emnist-digits"),
|
||||||
emnist_letters=_emnist_meta("emnist-letters"),
|
emnist_letters=_make_meta2("emnist-letters"),
|
||||||
emnist_mnist=_emnist_meta("emnist-mnist"),
|
emnist_mnist=_make_meta2("emnist-mnist"),
|
||||||
fashion_mnist=_make_meta(prefix="fashion-mnist"),
|
fashion_mnist=_make_meta("fashion-mnist"),
|
||||||
mnist=_make_meta(prefix="mnist"),
|
mnist=_make_meta("mnist"),
|
||||||
|
qmnist=_make_meta2("qmnist"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -90,14 +92,15 @@ def validate(name):
|
||||||
if name not in hashes.keys():
|
if name not in hashes.keys():
|
||||||
raise UnknownDatasetError(name)
|
raise UnknownDatasetError(name)
|
||||||
|
|
||||||
with open(construct_path(name), "rb") as f:
|
path = construct_path(name)
|
||||||
|
with open(path, "rb") as f:
|
||||||
data = f.read()
|
data = f.read()
|
||||||
|
|
||||||
known_hash = hashes[name]
|
known_hash = hashes[name]
|
||||||
hash = hashlib.sha256(data).hexdigest()
|
hash = hashlib.sha256(data).hexdigest()
|
||||||
|
|
||||||
if hash != known_hash:
|
if hash != known_hash:
|
||||||
raise IntegrityError(file, known_hash, hash)
|
raise IntegrityError(path, known_hash, hash)
|
||||||
|
|
||||||
|
|
||||||
def onehot(ind):
|
def onehot(ind):
|
||||||
|
@ -159,17 +162,13 @@ def prepare(dataset="mnist", return_floats=True, return_onehot=True,
|
||||||
prefix, names = meta[0], meta[1:]
|
prefix, names = meta[0], meta[1:]
|
||||||
names = [os.path.join(prefix, name) for name in names]
|
names = [os.path.join(prefix, name) for name in names]
|
||||||
train_images, train_labels, test_images, test_labels = names
|
train_images, train_labels, test_images, test_labels = names
|
||||||
images_and_labels = names[1:]
|
|
||||||
|
|
||||||
logger.debug("Filenames chosen for %s: %s, %s, %s, %s",
|
logger.debug("Filenames chosen for %s: %s, %s, %s, %s",
|
||||||
dataset, train_images, train_labels, test_images, test_labels)
|
dataset, train_images, train_labels, test_images, test_labels)
|
||||||
|
|
||||||
make_directories()
|
make_directories()
|
||||||
|
|
||||||
existing = [os.path.isfile(construct_path(name)) for name in names]
|
for name in names:
|
||||||
gz_existing = existing[0], existing[1:]
|
|
||||||
|
|
||||||
for name in images_and_labels:
|
|
||||||
download(name)
|
download(name)
|
||||||
if check_integrity:
|
if check_integrity:
|
||||||
validate(name)
|
validate(name)
|
||||||
|
|
|
@ -21,6 +21,7 @@ urls = {
|
||||||
"emnist": "//www.nist.gov/itl/iad/image-group/emnist-dataset",
|
"emnist": "//www.nist.gov/itl/iad/image-group/emnist-dataset",
|
||||||
"fashion-mnist": "//github.com/zalandoresearch/fashion-mnist",
|
"fashion-mnist": "//github.com/zalandoresearch/fashion-mnist",
|
||||||
"mnist": "http://yann.lecun.com/exdb/mnist/",
|
"mnist": "http://yann.lecun.com/exdb/mnist/",
|
||||||
|
"qmnist": "//github.com/facebookresearch/qmnist",
|
||||||
}
|
}
|
||||||
|
|
||||||
print(row.format(*headers))
|
print(row.format(*headers))
|
||||||
|
@ -31,7 +32,7 @@ for name in metadata.keys():
|
||||||
# print out the shape table for use in the README.
|
# print out the shape table for use in the README.
|
||||||
data = prepare(name)
|
data = prepare(name)
|
||||||
prefix = metadata[name][0]
|
prefix = metadata[name][0]
|
||||||
row_data = [f"[{prefix}][]"]
|
row_data = ["[{}][]".format(prefix)]
|
||||||
row_data += [name.replace("_", "\\_")]
|
row_data += [name.replace("_", "\\_")]
|
||||||
row_data += [str(array.shape) for array in data]
|
row_data += [str(array.shape) for array in data]
|
||||||
print(row.format(*row_data))
|
print(row.format(*row_data))
|
||||||
|
@ -39,4 +40,4 @@ for name in metadata.keys():
|
||||||
print()
|
print()
|
||||||
|
|
||||||
for anchor, url in urls.items():
|
for anchor, url in urls.items():
|
||||||
print(f"[{anchor}]: {url}")
|
print("[{}]: {}".format(anchor, url))
|
||||||
|
|
|
@ -5,9 +5,9 @@ class IntegrityError(Exception):
|
||||||
self.computed_hash = computed_hash
|
self.computed_hash = computed_hash
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f"""Failed to validate dataset: {name}
|
return """Failed to validate dataset: {self.file}
|
||||||
Hash mismatch: {self.computed_hash} should be {self.expected_hash}
|
Hash mismatch: {self.computed_hash} should be {self.expected_hash}
|
||||||
Please check your local file for tampering or corruption."""
|
Please check your local file for tampering or corruption.""".format(self=self)
|
||||||
|
|
||||||
|
|
||||||
class UnknownDatasetError(Exception):
|
class UnknownDatasetError(Exception):
|
||||||
|
@ -15,4 +15,4 @@ class UnknownDatasetError(Exception):
|
||||||
self.dataset = dataset
|
self.dataset = dataset
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f"Unknown mnist-like dataset: {dataset}"
|
return "Unknown mnist-like dataset: {self.dataset}".format(self=self)
|
||||||
|
|
|
@ -63,4 +63,12 @@ hashes = {
|
||||||
'440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609',
|
'440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609',
|
||||||
'mnist/train-labels-idx1-ubyte.gz':
|
'mnist/train-labels-idx1-ubyte.gz':
|
||||||
'3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c',
|
'3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c',
|
||||||
|
'qmnist/qmnist-test-images-idx3-ubyte.gz':
|
||||||
|
'43fc22bf7498b8fc98de98369d72f752d0deabc280a43a7bcc364ab19e57b375',
|
||||||
|
'qmnist/qmnist-test-labels-idx1-ubyte.gz':
|
||||||
|
'3f384004f51536c2a29f2ce4d36388ee6cb8fff45bc3ad0cc588a86f2cc76375',
|
||||||
|
'qmnist/qmnist-train-images-idx3-ubyte.gz':
|
||||||
|
'9e26a7bf1683614e065d7b76460ccd52807165b3f22561fb782bd9f38c52b51d',
|
||||||
|
'qmnist/qmnist-train-labels-idx1-ubyte.gz':
|
||||||
|
'3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c',
|
||||||
}
|
}
|
||||||
|
|
4
setup.py
4
setup.py
|
@ -2,7 +2,7 @@ from setuptools import setup
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='mnists',
|
name='mnists',
|
||||||
version='0.3.0',
|
version='0.4.0',
|
||||||
packages=[
|
packages=[
|
||||||
'mnists',
|
'mnists',
|
||||||
],
|
],
|
||||||
|
@ -22,7 +22,7 @@ setup(
|
||||||
'Natural Language :: English',
|
'Natural Language :: English',
|
||||||
'Programming Language :: Python',
|
'Programming Language :: Python',
|
||||||
'Programming Language :: Python :: 3',
|
'Programming Language :: Python :: 3',
|
||||||
'Programming Language :: Python :: 3.6',
|
'Programming Language :: Python :: 3.5',
|
||||||
'Topic :: Scientific/Engineering',
|
'Topic :: Scientific/Engineering',
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue