# Coverage report: src/bob/io/base/__init__.py, 84% of 114 statements covered
# (coverage.py v7.0.5, created at 2023-06-16 13:56 +0200)
# Import libraries of other lib packages
import logging
import os

import h5py
import imageio
import numpy as np

logger = logging.getLogger(__name__)

# Allow loading truncated files in case PIL is used
# https://github.com/kirumang/Pix2Pose/issues/2
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


hdf5_extensions = [".hdf5", ".h5", ".hdf"]
image_extensions = [
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".gif",
    ".tif",
    ".tiff",
    ".pgm",
    ".pbm",
    ".pnm",
    ".ppm",
]


def _is_string(s):
    """Returns ``True`` if the given object is a string or bytes."""
    return isinstance(s, (bytes, str))


@np.deprecate(new_name="os.makedirs(directory, exist_ok=True)")
def create_directories_safe(directory, dryrun=False):
    """Creates a directory if it does not exist, with concurrent access
    support. This function will also create any parent directories that might
    be required. If the dryrun option is selected, it does not actually create
    the directory, but only prints the (Linux) command that would have been
    executed.

    **Parameters:**

    ``directory`` : str
      The directory that you want to create.

    ``dryrun`` : bool
      Only ``print`` the command to console, but do not execute it.
    """
    if dryrun:
        print("[dry-run] mkdir -p '%s'" % directory)
    else:
        os.makedirs(directory, exist_ok=True)
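

# A minimal usage sketch (not part of the original module): the path below is
# a hypothetical example. With dryrun=True the function only prints the
# command; the stdlib call named in the deprecation above is the direct
# replacement.
def _example_create_directories(path="/tmp/example_tree/a/b"):
    create_directories_safe(path, dryrun=True)  # prints the mkdir -p command only
    os.makedirs(path, exist_ok=True)  # recommended replacement; creates parents, tolerates existing dirs
    return os.path.isdir(path)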


def open_file(filename) -> np.ndarray:
    """Reads the content of a file.

    Parameters
    ----------
    ``filename`` : str
        The name of the file to open.
    """

    def check_gray(img):
        # Checking for gray-scale images stored with three identical channels
        if (
            img.ndim > 2
            and np.array_equal(img[:, :, 0], img[:, :, 1])
            and np.array_equal(img[:, :, 0], img[:, :, 2])
        ):
            img = img[:, :, 0]
        return img

    # get the extension
    extension = os.path.splitext(filename)[1].lower()

    if extension in hdf5_extensions:
        with h5py.File(filename, "r") as f:
            keys = list(f.keys())
            if len(keys) == 1:
                key = keys[0]
            else:
                key = "array"
                if key not in keys:
                    raise RuntimeError(
                        f"The file {filename} does not contain the key {key}"
                    )
            dataset = f[key]
            # if the data was saved as a string, load it back as a string
            string_dtype = h5py.check_string_dtype(dataset.dtype)
            if string_dtype is not None:
                dataset = dataset.asstr()
            return dataset[()]
    elif extension in image_extensions:
        from ..image import to_bob

        img = imageio.imread(filename)

        # PNGs may have a 4th (alpha) channel, which has to be ignored
        if img.ndim > 2:
            if extension == ".png":
                img = img[:, :, 0:3]
            img = check_gray(img)
        return img if img.ndim == 2 else to_bob(img)
    else:
        raise ValueError(f"Unknown file extension: {extension}")
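

# An illustrative sketch (not part of the original module) of how open_file
# resolves the dataset key in an HDF5 file: a file with a single dataset is
# read regardless of the dataset's name; with several datasets, one named
# "array" is required. The file path is a hypothetical example.
def _example_hdf5_key_resolution(path="/tmp/example_features.h5"):
    with h5py.File(path, "w") as f:
        f["features"] = np.zeros((2, 3))  # single dataset with an arbitrary name
    data = open_file(path)  # the lone "features" dataset is picked up
    assert data.shape == (2, 3)
    return data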


def write_file(filename, data, format="pillow") -> None:
    """Writes the contents of a :py:class:`numpy.ndarray` to a file.

    Parameters
    ----------
    ``filename`` : str
        The name of the file to write to.

    ``data`` : :py:class:`numpy.ndarray`
        The data to write to the file.

    ``format`` : str
        The format to use when writing the file. Defaults to ``"pillow"``,
        which has the best support for all image formats.
    """
    extension = os.path.splitext(filename)[1].lower()  # get the extension

    if extension in hdf5_extensions:
        with h5py.File(filename, "w") as f:
            f["array"] = data
    elif extension in image_extensions:
        # Pillow is the format with the best support for all image formats
        from ..image import to_matplotlib

        imageio.imwrite(filename, to_matplotlib(data), format=format)
    else:
        raise RuntimeError(f"Unknown file extension: {extension}")
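

# A minimal roundtrip sketch (not part of the original module): write_file
# stores arrays under the "array" key for HDF5 extensions, so open_file can
# read them back. The file path is a hypothetical example.
def _example_write_read_roundtrip(path="/tmp/example_data.hdf5"):
    data = np.arange(6, dtype=np.float64).reshape(2, 3)
    write_file(path, data)  # dispatches to the HDF5 branch based on ".hdf5"
    loaded = open_file(path)  # finds the single "array" dataset
    assert np.array_equal(loaded, data)
    return loaded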


def load(inputs) -> np.ndarray:
    """Loads the content of a file.

    Will take a filename (or an iterable of filenames) and put the content into a
    :py:class:`numpy.ndarray`.

    **Parameters:**

    ``inputs`` : various types

      This might represent several different entities:

      1. The name of a file (full path) from where to load the data. In this
         case, this assumes that the file contains an array and returns a
         loaded numpy ndarray.
      2. An iterable of filenames to be loaded in memory. In this case, this
         would assume that each file contains a single 1D sample or a set of
         1D samples, load them into memory and concatenate them into a single
         2D :py:class:`numpy.ndarray`, which is returned.

    **Returns:**

    ``data`` : :py:class:`numpy.ndarray`
      The data loaded from the given ``inputs``.
    """
    from collections.abc import Iterable

    if _is_string(inputs):
        if not os.path.exists(inputs):
            raise RuntimeError(f"`{inputs}' does not exist!")
        try:
            return open_file(inputs)
        except Exception as e:
            raise RuntimeError(f"Could not load `{inputs}'!") from e
    elif isinstance(inputs, Iterable):
        retval = []
        for obj in inputs:
            if _is_string(obj):
                retval.append(load(obj))
            else:
                raise TypeError(
                    "Iterable contains an object which is not a filename"
                )
        return np.vstack(retval)
    else:
        raise TypeError(
            "Unexpected input object. This function is expecting a filename, "
            "or an iterable of filenames."
        )


def save(array, filename, create_directories=False):
    """Saves the contents of an array-like object to file.

    Effectively, this is the same as opening a file with the mode flag set to ``'w'``
    (write with truncation) and calling ``file.write`` passing ``array`` as parameter.

    Parameters:

    ``array`` : array_like
      The array-like object to be saved in the file

    ``filename`` : str
      The name of the file where the contents will be saved

    ``create_directories`` : bool
      Automatically generate the directories if required (defaults to ``False``
      for compatibility reasons; this might change in the future to default to
      ``True``)
    """
    # create directory if not existent yet
    if create_directories:
        create_directories_safe(os.path.dirname(filename))

    # if array is a string, don't create a numpy array
    if not isinstance(array, str):
        # requires data is C-contiguous and aligned, will create a copy otherwise
        array = np.require(array, requirements=("C_CONTIGUOUS", "ALIGNED"))

    write_file(filename, array)


# Just to make it homogeneous with the C++ API
write = save
read = load
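

# A usage sketch (not part of the original module): save() normalizes the
# array layout and can create missing parent directories; read() is the alias
# defined above. The directory and file names are hypothetical examples.
def _example_save_and_read(path="/tmp/example_dir/nested/data.h5"):
    arr = np.ones((3, 2))[:, ::-1]  # a non-contiguous view; save() copies it to C order
    save(arr, path, create_directories=True)  # parent directories are created first
    loaded = read(path)
    assert np.array_equal(loaded, arr)
    return loaded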


# Keeps compatibility with the previously existing API
# open = File


def _generate_features(reader, paths, same_size=False):
    """Load and stack features in a memory efficient way. This function is
    meant to be used inside :py:func:`vstack_features`.

    Parameters
    ----------
    reader : ``collections.abc.Callable``
        See the documentation of :py:func:`vstack_features`.
    paths : ``collections.abc.Iterable``
        See the documentation of :py:func:`vstack_features`.
    same_size : :obj:`bool`, optional
        See the documentation of :py:func:`vstack_features`.

    Yields
    ------
    object
        The first object yielded is a tuple of the :py:class:`numpy.dtype` of
        the features and the shape of the first feature. The remaining objects
        are the actual feature values, returned in C order.
    """
    shape_determined = False
    for path in paths:
        feature = np.atleast_2d(reader(path))
        feature = np.ascontiguousarray(feature)
        if not shape_determined:
            shape_determined = True
            dtype = feature.dtype
            shape = list(feature.shape)
            yield (dtype, shape)
        else:
            # make sure all features have the same shape and dtype
            if same_size:
                assert shape == list(
                    feature.shape
                ), f"Expected feature shape of {shape}, got {feature.shape}"
            else:
                assert shape[1:] == list(
                    feature.shape[1:]
                ), f"Ignoring the first dimension, expected feature shape of {shape}, got {feature.shape}"
            assert dtype == feature.dtype

        if same_size:
            yield (feature.ravel(),)
        else:
            for feat in feature:
                yield (feat.ravel(),)
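

# An illustrative sketch (not part of the original module) of the generator
# protocol above: the first item yielded is (dtype, shape) metadata, and every
# following item is a 1-tuple holding one raveled row, which is exactly what
# np.fromiter consumes in vstack_features below.
def _example_generator_protocol():
    gen = _generate_features(lambda path: np.arange(4.0).reshape(2, 2), ["p1"])
    dtype, shape = next(gen)  # metadata: (dtype('float64'), [2, 2])
    rows = [row for (row,) in gen]  # two raveled rows, each of length 2
    assert dtype == np.float64 and shape == [2, 2] and len(rows) == 2
    return rows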


def vstack_features(reader, paths, same_size=False, dtype=None):
    """Stacks all features in a memory efficient way.

    Parameters
    ----------
    reader : ``collections.abc.Callable``
        The function to load the features. The function should only take one
        argument ``path`` and return the loaded features. Use
        :any:`functools.partial` to adapt your reader to this interface.
        The features returned by ``reader`` are expected to have the same
        :py:class:`numpy.dtype` and the same shape except for their first
        dimension. The first dimension should correspond to the number of
        samples.
    paths : ``collections.abc.Iterable``
        An iterable of paths to iterate on. Whatever is inside a path is given
        to ``reader``, so the entries do not necessarily need to be paths to
        actual files. If ``same_size`` is ``True``, ``len(paths)`` must be
        valid.
    same_size : :obj:`bool`, optional
        If ``True``, it assumes that arrays inside all the paths are the same
        shape. If you know the features are the same size in all paths, set
        this to ``True`` to improve the performance.
    dtype : :py:class:`numpy.dtype`, optional
        If provided, the data will be cast to this dtype.

    Returns
    -------
    numpy.ndarray
        The read features with the shape ``(n_samples, *features_shape[1:])``.

    Examples
    --------
    In its simplest form, this function is equivalent to calling
    ``numpy.vstack([reader(p) for p in paths])``.

    >>> import numpy
    >>> from bob.io.base import vstack_features
    >>> def reader(path):
    ...     # in each file, there are 5 samples and features are 2 dimensional.
    ...     return numpy.arange(10).reshape(5, 2)
    >>> paths = ['path1', 'path2']
    >>> all_features = vstack_features(reader, paths)
    >>> numpy.allclose(all_features, numpy.array(
    ...     [[0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9],
    ...      [0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9]]))
    True
    >>> all_features_with_more_memory = numpy.vstack([reader(p) for p in paths])
    >>> numpy.allclose(all_features, all_features_with_more_memory)
    True

    You can allocate the array at once to improve the performance if you know
    that all features in paths have the same shape and you know the total
    number of the paths:

    >>> all_features = vstack_features(reader, paths, same_size=True)
    >>> numpy.allclose(all_features, numpy.array(
    ...     [[0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9],
    ...      [0, 1],
    ...      [2, 3],
    ...      [4, 5],
    ...      [6, 7],
    ...      [8, 9]]))
    True
    """
    iterable = _generate_features(reader, paths, same_size)
    data_dtype, shape = next(iterable)
    if dtype is None:
        dtype = data_dtype
    if same_size:
        # numpy black magic: https://stackoverflow.com/a/12473478/1286165
        field_dtype = [("", (dtype, (np.prod(shape),)))]
        total_size = len(paths)
        all_features = np.fromiter(iterable, field_dtype, total_size)
    else:
        field_dtype = [("", (dtype, (np.prod(shape[1:]),)))]
        all_features = np.fromiter(iterable, field_dtype)

    # go from a field array to a normal array
    all_features = all_features.view(dtype)
    # the shape is assumed to be (n_samples, ...); it can be (5, 2) or (5, 3, 4)
    shape = list(shape)
    shape[0] = -1
    return np.reshape(all_features, shape, order="C")
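

# An illustrative sketch (not part of the original module) of the structured
# dtype trick referenced above: np.fromiter only builds 1D arrays, but wrapping
# each row in a single-field dtype lets it consume whole rows at a time;
# viewing the result with the plain dtype then recovers the numeric layout.
def _example_fromiter_rows():
    rows = ((np.arange(3.0) + i,) for i in range(4))  # four rows of length 3
    field_dtype = [("", (np.float64, (3,)))]  # one anonymous field per row
    flat = np.fromiter(rows, field_dtype).view(np.float64)
    stacked = flat.reshape(-1, 3)  # back to the (n_samples, n_features) shape
    assert stacked.shape == (4, 3)
    return stacked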


# gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith("_")]