More Manifest Info by bbengfort · Pull Request #20 · rotationalio/construe · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

More Manifest Info #20

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ jobs:

- name: Run Tests
run: pytest
env:
CONSTRUE_TEST_SAMPLE_LOADERS: 1
29 changes: 27 additions & 2 deletions construe/cloud/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import json
import glob
import zipfile

from urllib.parse import urljoin

Expand All @@ -24,28 +25,52 @@ def load_manifest(path):
return json.load(f)


def generate_manifest(fixtures, out, upload_type, extra=None):
    """
    Write a JSON manifest describing every ``*.zip`` fixture in the
    ``fixtures`` directory to the file at ``out``.

    Each manifest entry is keyed by the archive basename (without the
    ``.zip`` extension) and records the download URL, a SHA-256 signature,
    and the compressed/decompressed sizes. If ``extra`` is provided it is
    merged into each entry: a callable receives the fixture ``path``,
    ``name``, and the computed entry fields as keyword arguments and must
    return a dict; a plain mapping is merged as-is.
    """
    version = get_version(short=True)
    manifest = {}

    # Walk the fixtures in sorted order so the manifest output is stable
    # across runs and filesystems.
    for path in sorted(glob.glob(os.path.join(fixtures, "*.zip"))):
        fname = os.path.basename(path)
        name, _ = os.path.splitext(fname)

        entry = {
            "url": make_fixture_url(fname, upload_type=upload_type, version=version),
            "signature": sha256sum(path),
            "size": {
                "compressed": os.path.getsize(path),
                "decompressed": get_uncompressed_size(path),
            },
        }

        # Merge any caller-supplied metadata into the entry.
        if extra is not None:
            if callable(extra):
                entry.update(extra(path=path, name=name, **entry))
            else:
                entry.update(extra)

        manifest[name] = entry

    with open(out, "w") as fobj:
        json.dump(manifest, fobj, indent=2)


def make_fixture_url(fname, upload_type, version=None):
    """
    Return the public download URL for the named fixture archive.

    Parameters
    ----------
    fname : str
        Basename of the fixture archive (e.g. ``"nsfw.zip"``).
    upload_type : str
        Upload category segment of the path (e.g. ``"datasets"``).
    version : str, optional
        Version string for the path; defaults to the short package version.
    """
    # Bucket must be joined here and not make_fixture_path to support uploading
    path = make_fixture_path(fname, upload_type, version)
    # os.path.join uses backslashes on Windows, which are invalid in URLs;
    # normalize to forward slashes before joining to the base URL.
    path = os.path.join(BUCKET, path).replace(os.sep, "/")
    return urljoin(BASE_URL, path)


def make_fixture_path(fname, upload_type, version=None):
    """
    Build the relative storage path for a fixture in the form
    ``v<version>/<upload_type>/<fname>``, defaulting to the short
    package version when no version is supplied.
    """
    if not version:
        version = get_version(short=True)
    return os.path.join(f"v{version}", upload_type, fname)


def get_uncompressed_size(path: str) -> int:
    """
    Return the total uncompressed size in bytes of all members of the zip
    archive at ``path``.

    Raises
    ------
    zipfile.BadZipFile
        If the file at ``path`` is not a valid zip archive.
    """
    # Sum the stored (uncompressed) member sizes directly; this also avoids
    # shadowing the ``bytes`` builtin used as an accumulator previously.
    with zipfile.ZipFile(path, "r") as zf:
        return sum(info.file_size for info in zf.infolist())
7 changes: 6 additions & 1 deletion construe/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,10 @@
"""

from .loaders import * # noqa
from .download import download_data
from .download import download_data, load_manifest
from .path import get_data_home, cleanup_dataset

# Eagerly load the dataset manifest so DATASETS is available at import time;
# fall back to None when the manifest cannot be read (e.g. missing or invalid
# file — presumably so importing the package never raises; confirm callers
# check for None) rather than propagating the error.
try:
    DATASETS = load_manifest()
except Exception:
    DATASETS = None
8 changes: 5 additions & 3 deletions construe/datasets/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,15 @@ def _download_dataset(
raise DatasetsError(f"no dataset named {name} exists")

info = datasets[name]
info.update({
kwargs = {
"data_home": data_home,
"replace": replace,
"extract": extract,
"progress": progress,
})
download_data(**info)
"url": info["url"],
"signature": info["signature"],
}
download_data(**kwargs)


download_dialects = partial(_download_dataset, DIALECTS)
Expand Down
15 changes: 9 additions & 6 deletions construe/datasets/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,11 @@ def _load_prepare(name, sample=True, data_home=None):
info = _info(name)
if not dataset_archive(name, info["signature"], data_home=data_home):
# If the dataset does not exist, download and extract it
info.update({"data_home": data_home, "replace": True, "extract": True})
download_data(**info)
kwargs = {
"data_home": data_home, "replace": True, "extract": True,
"url": info["url"], "signature": info["signature"],
}
download_data(**kwargs)

return find_dataset_path(name, data_home=data_home, fname=None, ext=None)

Expand All @@ -56,8 +59,8 @@ def _load_file_dataset(name, sample=True, data_home=None, no_dirs=True, pattern=
data_path = _load_prepare(name, sample=sample, data_home=data_home)

# Glob pattern for discovering files in the dataset
if pattern is not None:
pattern = os.path.join(data_path, name, "**", "*")
if pattern is None:
pattern = os.path.join(data_path, "**", "*")
else:
pattern = os.path.join(data_path, pattern)

Expand All @@ -73,7 +76,7 @@ def _load_jsonl_dataset(name, sample=True, data_home=None):
for path in glob.glob(os.path.join(data_path, "*.jsonl")):
with open(path, "r") as f:
for line in f:
yield json.load(f)
yield json.loads(line.strip())


def _cleanup_dataset(name, sample=True, data_home=None):
Expand All @@ -100,7 +103,7 @@ def _cleanup_dataset(name, sample=True, data_home=None):
load_aegis = partial(_load_jsonl_dataset, AEGIS)
cleanup_aegis = partial(_cleanup_dataset, AEGIS)

load_nsfw = partial(_load_file_dataset, NSFW)
load_nsfw = partial(_load_file_dataset, NSFW, pattern="nsfw/**/*.jpg")
cleanup_nsfw = partial(_cleanup_dataset, NSFW)


Expand Down
188 changes: 153 additions & 35 deletions construe/datasets/manifest.json
Original file line number Diff line number Diff line change
@@ -1,58 +1,176 @@
{
"dialects": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/dialects.zip",
"signature": "0e6767047e05f618560d097dfa0587530636c52fc19507c087bdff556b389489"
"aegis-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/aegis-sample.zip",
"signature": "a2b3ae9c5a19833cc594fc4c14a6bfce35ab9c6086f0c2836d2719ab788119bd",
"size": {
"compressed": 916334,
"decompressed": 2878359
},
"instances": 3030
},
"lowlight": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/lowlight.zip",
"signature": "ddc36eb7f0443efa5e71939e503d0834fd48451281d9658d5cb7ead30143b98f"
"aegis": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/aegis.zip",
"signature": "c846f20d893461525839cd2f61f85faf0dcbff03e1998fd8f747506ff65bec69",
"size": {
"compressed": 3619910,
"decompressed": 11362916
},
"instances": 11997
},
"dialects-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/dialects-sample.zip",
"signature": "9e9509f4d82468c896bede36b16c6de218a1dce28a56ae49d1fb75933bf770c5"
},
"reddit": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/reddit.zip",
"signature": "d97419403f0d940970b2542d5b188570dacedae3c2a68ada3520cfa95c52f75c"
"signature": "9e9509f4d82468c896bede36b16c6de218a1dce28a56ae49d1fb75933bf770c5",
"size": {
"compressed": 243136640,
"decompressed": 356704802
},
"instances": 1785,
"classes": {
"northern_male": 203,
"southern_female": 417,
"northern_female": 65,
"irish_male": 48,
"scottish_male": 172,
"welsh_female": 120,
"southern_male": 436,
"midlands_female": 24,
"midlands_male": 51,
"welsh_male": 157,
"scottish_female": 92
}
},
"movies-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/movies-sample.zip",
"signature": "2d3d9294ad875e7489db94fc2ab02c1ad6dfdc15a2bf1a5037be36a6defc8168"
"dialects": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/dialects.zip",
"signature": "0e6767047e05f618560d097dfa0587530636c52fc19507c087bdff556b389489",
"size": {
"compressed": 2466918919,
"decompressed": 3605272328
},
"instances": 17877,
"classes": {
"northern_male": 2097,
"southern_female": 4161,
"northern_female": 750,
"irish_male": 450,
"scottish_male": 1649,
"welsh_female": 1199,
"southern_male": 4331,
"midlands_female": 246,
"midlands_male": 450,
"welsh_male": 1650,
"scottish_female": 894
}
},
"essays-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/essays-sample.zip",
"signature": "a77fc1c2c2718d79132598e6c873fd5b08c40c2e4049d995317747fb76b96631"
},
"aegis-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/aegis-sample.zip",
"signature": "a2b3ae9c5a19833cc594fc4c14a6bfce35ab9c6086f0c2836d2719ab788119bd"
},
"aegis": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/aegis.zip",
"signature": "c846f20d893461525839cd2f61f85faf0dcbff03e1998fd8f747506ff65bec69"
},
"nsfw-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/nsfw-sample.zip",
"signature": "d5044f30769d3a6e9ba639312120dc955bdfcf4d8aa8a6f3ee493334644b9fcd"
"signature": "a77fc1c2c2718d79132598e6c873fd5b08c40c2e4049d995317747fb76b96631",
"size": {
"compressed": 1796330,
"decompressed": 8785856
},
"instances": 512
},
"essays": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/essays.zip",
"signature": "3a7b260dd5baec9134c7398ac7b9b297d7b1a387bce1a9f99cd8d3e0a7ceb9cc"
},
"reddit-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/reddit-sample.zip",
"signature": "24088c648b8c3497d0b682102c3fa965d46ca22abe8f94695287e09bf82db991"
"signature": "3a7b260dd5baec9134c7398ac7b9b297d7b1a387bce1a9f99cd8d3e0a7ceb9cc",
"size": {
"compressed": 7116584,
"decompressed": 35516576
},
"instances": 2078
},
"lowlight-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/lowlight-sample.zip",
"signature": "f34bafa588441b8e240b0932e9ac446d9f805bdfdb22640c036c441258220eaf"
"signature": "f34bafa588441b8e240b0932e9ac446d9f805bdfdb22640c036c441258220eaf",
"size": {
"compressed": 166217847,
"decompressed": 166608858
},
"instances": 475,
"classes": {
"high": 242,
"low": 233
}
},
"lowlight": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/lowlight.zip",
"signature": "ddc36eb7f0443efa5e71939e503d0834fd48451281d9658d5cb7ead30143b98f",
"size": {
"compressed": 347470078,
"decompressed": 348256471
},
"instances": 1000,
"classes": {
"high": 500,
"low": 500
}
},
"movies-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/movies-sample.zip",
"signature": "2d3d9294ad875e7489db94fc2ab02c1ad6dfdc15a2bf1a5037be36a6defc8168",
"size": {
"compressed": 381174092,
"decompressed": 387776108
},
"instances": 5465,
"classes": {
"movies": 5465
}
},
"movies": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/movies.zip",
"signature": "618f7aa8aa103192ee8b76fc701ff182b2a41e5e78675a4d6af707e490d36f45"
"signature": "618f7aa8aa103192ee8b76fc701ff182b2a41e5e78675a4d6af707e490d36f45",
"size": {
"compressed": 7351355869,
"decompressed": 7479027563
},
"instances": 106844,
"classes": {
"movies": 106844
}
},
"nsfw-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/nsfw-sample.zip",
"signature": "d5044f30769d3a6e9ba639312120dc955bdfcf4d8aa8a6f3ee493334644b9fcd",
"size": {
"compressed": 6429140,
"decompressed": 6535438
},
"instances": 53,
"classes": {
"safe": 28,
"nsfw": 25
}
},
"nsfw": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/nsfw.zip",
"signature": "7ac498e8f17428c51a5c8c366aaf10b47663a9eb8a560fd8abe01366eaf60139"
"signature": "7ac498e8f17428c51a5c8c366aaf10b47663a9eb8a560fd8abe01366eaf60139",
"size": {
"compressed": 27937058,
"decompressed": 28266876
},
"instances": 215,
"classes": {
"safe": 108,
"nsfw": 107
}
},
"reddit-sample": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/reddit-sample.zip",
"signature": "24088c648b8c3497d0b682102c3fa965d46ca22abe8f94695287e09bf82db991",
"size": {
"compressed": 63979,
"decompressed": 278734
},
"instances": 957
},
"reddit": {
"url": "https://storage.googleapis.com/construe/v0.3.0/datasets/reddit.zip",
"signature": "d97419403f0d940970b2542d5b188570dacedae3c2a68ada3520cfa95c52f75c",
"size": {
"compressed": 244363,
"decompressed": 1117785
},
"instances": 3844
}
}
Loading
Loading
0