|
| 1 | +import contextlib |
1 | 2 | import warnings |
2 | 3 | from abc import ABC, abstractmethod |
3 | 4 | from collections.abc import Iterator |
4 | 5 | from io import BytesIO |
5 | 6 | from pathlib import Path |
| 7 | +from pathlib import PurePosixPath as ObjectPath |
6 | 8 |
|
7 | 9 | import boto3 |
8 | 10 | from botocore.exceptions import ClientError |
9 | 11 | from google.cloud.exceptions import NotFound |
10 | 12 | from google.cloud.storage import Blob, Bucket |
| 13 | +from obstore.store import ObjectStore |
11 | 14 |
|
12 | 15 |
|
13 | 16 | class JobCache(ABC): |
@@ -62,6 +65,53 @@ def group(self, key: str) -> "NoCache": |
62 | 65 | return self |
63 | 66 |
|
64 | 67 |
|
| 68 | +class ObstoreCache(JobCache): |
| 69 | + def __init__(self, store: ObjectStore, prefix: str | ObjectPath = ObjectPath(".")) -> None: |
| 70 | + """A cache implementation backed by an obstore ObjectStore. |
| 71 | +
|
| 72 | + This cache implementation is the recommended way of working with the cache, as it provides a unified interface |
| 73 | + for working with different object stores, while also providing a way to transparently work with local files |
| 74 | + as well. |
| 75 | +
|
| 76 | + Args: |
| 77 | + store: The object store to use for the cache. |
| 78 | + prefix: A path prefix to append to all objects stored in the cache. Defaults to no prefix. |
| 79 | + """ |
| 80 | + self.store = store |
| 81 | + self.prefix = ObjectPath(prefix) |
| 82 | + |
| 83 | + def __contains__(self, key: str) -> bool: |
| 84 | + with contextlib.suppress(OSError): |
| 85 | + self.store.get(str(self.prefix / key)) |
| 86 | + return True # if get is successful, we know the key is in the cache |
| 87 | + |
| 88 | + return False |
| 89 | + |
| 90 | + def __setitem__(self, key: str, value: bytes) -> None: |
| 91 | + self.store.put(str(self.prefix / key), value) |
| 92 | + |
| 93 | + def __delitem__(self, key: str) -> None: |
| 94 | + try: |
| 95 | + self.store.delete(str(self.prefix / key)) |
| 96 | + except OSError: |
| 97 | + raise KeyError(f"{key} is not cached!") from None |
| 98 | + |
| 99 | + def __getitem__(self, key: str) -> bytes: |
| 100 | + try: |
| 101 | + entry = self.store.get(str(self.prefix / key)) |
| 102 | + return bytes(entry.bytes()) |
| 103 | + except OSError: |
| 104 | + raise KeyError(f"{key} is not cached!") from None |
| 105 | + |
| 106 | + def __iter__(self) -> Iterator[str]: |
| 107 | + for obj in self.store.list_with_delimiter(str(self.prefix))["objects"]: |
| 108 | + path: str = obj["path"] |
| 109 | + yield path.removeprefix(str(self.prefix) + "/") |
| 110 | + |
| 111 | + def group(self, key: str) -> "ObstoreCache": |
| 112 | + return ObstoreCache(self.store, prefix=str(self.prefix / key)) |
| 113 | + |
| 114 | + |
65 | 115 | class InMemoryCache(JobCache): |
66 | 116 | def __init__(self) -> None: |
67 | 117 | """A simple in-memory cache implementation. |
@@ -153,7 +203,7 @@ def __init__(self, root: Path | str = Path("cache")) -> None: |
153 | 203 | Args: |
154 | 204 | root: File system path where the cache will be stored. Defaults to "cache" in the current working directory. |
155 | 205 | """ |
156 | | - self.root = root if isinstance(root, Path) else Path(root) |
| 206 | + self.root = Path(root) |
157 | 207 |
|
158 | 208 | def __contains__(self, key: str) -> bool: |
159 | 209 | return (self.root / key).exists() |
@@ -184,15 +234,17 @@ def group(self, key: str) -> "LocalFileSystemCache": |
184 | 234 |
|
185 | 235 |
|
186 | 236 | class GoogleStorageCache(JobCache): |
187 | | - def __init__(self, bucket: Bucket, prefix: str = "jobs") -> None: |
| 237 | + def __init__(self, bucket: Bucket, prefix: str | ObjectPath = "jobs") -> None: |
188 | 238 | """A cache implementation that stores data in Google Cloud Storage. |
189 | 239 |
|
190 | 240 | Args: |
191 | 241 | bucket: The Google Cloud Storage bucket to use for the cache. |
192 | 242 | prefix: A path prefix to append to all objects stored in the cache. Defaults to "jobs". |
193 | 243 | """ |
194 | 244 | self.bucket = bucket |
195 | | - self.prefix = Path(prefix) # we still use pathlib here, because it's easier to work with when joining paths |
| 245 | + self.prefix = ObjectPath( |
| 246 | + prefix |
| 247 | + ) # we still use pathlib here, because it's easier to work with when joining paths |
196 | 248 |
|
197 | 249 | def _blob(self, key: str) -> Blob: |
198 | 250 | return self.bucket.blob(str(self.prefix / key)) |
@@ -228,22 +280,22 @@ def __iter__(self) -> Iterator[str]: |
228 | 280 |
|
229 | 281 | # make the names relative to the cache prefix (but including the key in the name) |
230 | 282 | for blob in blobs: |
231 | | - yield str(Path(blob.name).relative_to(self.prefix)) |
| 283 | + yield str(ObjectPath(blob.name).relative_to(self.prefix)) |
232 | 284 |
|
233 | 285 | def group(self, key: str) -> "GoogleStorageCache": |
234 | 286 | return GoogleStorageCache(self.bucket, prefix=str(self.prefix / key)) |
235 | 287 |
|
236 | 288 |
|
237 | 289 | class AmazonS3Cache(JobCache): |
238 | | - def __init__(self, bucket: str, prefix: str = "jobs") -> None: |
| 290 | + def __init__(self, bucket: str, prefix: str | ObjectPath = "jobs") -> None: |
239 | 291 | """A cache implementation that stores data in Amazon S3. |
240 | 292 |
|
241 | 293 | Args: |
242 | 294 | bucket: The Amazon S3 bucket to use for the cache. |
243 | 295 | prefix: A path prefix to append to all objects stored in the cache. Defaults to "jobs". |
244 | 296 | """ |
245 | 297 | self.bucket = bucket |
246 | | - self.prefix = Path(prefix) |
| 298 | + self.prefix = ObjectPath(prefix) |
247 | 299 | with warnings.catch_warnings(): |
248 | 300 | # https://github.com/boto/boto3/issues/3889 |
249 | 301 | warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*datetime.utcnow.*") |
|
0 commit comments