dataset_parser.h
Go to the documentation of this file.
1 /*
2 * ECVL - European Computer Vision Library
3 * Version: 1.0.0
4 * copyright (c) 2021, Università degli Studi di Modena e Reggio Emilia (UNIMORE), AImageLab
5 * Authors:
6 * Costantino Grana (costantino.grana@unimore.it)
7 * Federico Bolelli (federico.bolelli@unimore.it)
8 * Michele Cancilla (michele.cancilla@unimore.it)
9 * Laura Canalini (laura.canalini@unimore.it)
10 * Stefano Allegretti (stefano.allegretti@unimore.it)
11 * All rights reserved.
12 */
13 
14 #ifndef ECVL_DATASET_PARSER_H_
15 #define ECVL_DATASET_PARSER_H_
16 
#include "ecvl/core.h"
#include "ecvl/core/any.h"
#include "ecvl/core/filesystem.h"
#include "ecvl/core/optional.h"

#include <iostream>
#include <iterator>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
#include <vector>

#include "yaml-cpp/yaml.h"
29 
// Declares a strongly typed enum `name` whose underlying type is `unsigned`, together with a
// unary operator+ that converts one of its enumerators to the corresponding unsigned value
// (so that `+name::enumerator` can be used wherever a plain integer is needed).
#define UNSIGNED_ENUM_CLASS(name, ...) \
    enum class name : unsigned { __VA_ARGS__ }; \
    inline constexpr unsigned operator+(const name value) { return static_cast<unsigned>(value); }
33 
34 namespace ecvl
35 {
/** @brief Enum class representing the Dataset supported splits (training, validation, test).
    Declared through UNSIGNED_ENUM_CLASS, so an enumerator converts to unsigned with a leading `+`. */
40 UNSIGNED_ENUM_CLASS(SplitType, training, validation, test)
41 
42 
/** @brief Enum class representing allowed tasks for the ECVL Dataset. */
// NOTE(review): listing lines 48-49 (presumably the enumerators) are not visible in this
// listing - confirm them against the original header before relying on this definition.
46 enum class Task
47 {
50 };
51 
58 class Sample
59 {
60 public:
61  std::vector<filesystem::path> location_;
65  std::vector<int> size_;
76  ecvl::Image LoadImage(ecvl::ColorType ctype = ecvl::ColorType::RGB, const bool& is_gt = false);
77 };
78 
84 class Split
85 {
86 public:
87  std::string split_name_;
89  std::vector<int> samples_indices_;
90  bool drop_last_ = false;
93  bool no_label_ = false;
95  Split() = default;
96 
103  Split(const std::string& split_name, const std::vector<int>& samples_indices, const bool drop_last = false, const bool no_label = false)
104  : split_name_{ split_name }, samples_indices_{ samples_indices }, drop_last_{ drop_last }, no_label_{ no_label }
105  {
106  if (split_name_ == "training") split_type_ = SplitType::training;
107  else if (split_name_ == "validation") split_type_ = SplitType::validation;
108  else if (split_name_ == "test") split_type_ = SplitType::test;
109  }
110 
111  void SetNumBatches(int batch_size)
112  {
113  num_batches_ = drop_last_ ? vsize(samples_indices_) / batch_size : (vsize(samples_indices_) + batch_size - 1) / batch_size;
114  }
115 
116  void SetLastBatch(int batch_size)
117  {
118  // last batch is the remainder of the number of samples of the split divided by the batch size.
119  // if drop last is true or the remainder is 0, last batch is equal to the batch size.
120  auto value = vsize(samples_indices_) % batch_size;
121  last_batch_ = drop_last_ ? batch_size : (value == 0 ? batch_size : value);
122  }
123 };
124 
131 class Dataset
132 {
133  std::map<std::string, int> features_map_;
134  void DecodeImages(const YAML::Node& node, const filesystem::path& root_path, bool verify);
135  void FindLabel(Sample& sample, const YAML::Node& n);
136 protected:
137  std::vector<ecvl::Split>::iterator GetSplitIt(ecvl::any split);
138  const int GetSplitIndex(ecvl::any split);
139 public:
140  std::string name_ = "DeepHealth dataset";
141  std::string description_ = "This is the DeepHealth example dataset!";
142  std::vector<std::string> classes_;
143  std::vector<std::string> features_;
144  std::vector<Sample> samples_;
145  std::vector<Split> split_;
146  int current_split_ = -1;
149  Dataset() {}
150 
155  Dataset(const filesystem::path& filename, bool verify = false);
156 
157  /* Destructor */
158  virtual ~Dataset() {}
159 
166  std::vector<int>& GetSplit(const ecvl::any& split = -1);
167 
171  void SetSplit(const ecvl::any& split);
172 
180  void Dump(const filesystem::path& file_path);
181 
188  std::vector<std::vector<filesystem::path>> GetLocations() const;
189 
190  // RegEx which matchs URLs
191  static const std::regex url_regex_;
192 };
193 } // namespace ecvl
194 
195 #endif // ECVL_DATASET_PARSER_H_
std::vector< Split > split_
Splits of the Dataset. See Split.
Image class.
Definition: image.h:72
const int GetSplitIndex(ecvl::any split)
Split(const std::string &split_name, const std::vector< int > &samples_indices, const bool drop_last=false, const bool no_label=false)
void Dump(const filesystem::path &file_path)
Dump the Dataset into a YAML file following the DeepHealth Dataset Format.
optional< std::vector< int > > label_
Vector of sample labels.
std::vector< ecvl::Split >::iterator GetSplitIt(ecvl::any split)
virtual ~Dataset()
int vsize(const std::vector< T > &v)
Definition: image.h:34
std::vector< std::string > classes_
Vector with all the classes available in the Dataset.
std::string description_
Description of the Dataset.
ColorType
Enum class representing the ECVL supported color spaces.
Definition: image.h:50
std::vector< std::string > features_
Vector with all the features available in the Dataset.
std::vector< int > size_
Original x and y dimensions of the sample.
void SetLastBatch(int batch_size)
#define UNSIGNED_ENUM_CLASS(name,...)
std::vector< int > samples_indices_
Vector containing samples indices of the split.
optional< std::map< int, std::string > > values_
Map (map<feature-index,feature-value>) which stores the features of a sample.
std::vector< int > & GetSplit(const ecvl::any &split=-1)
Returns the image indexes of the requested split.
std::vector< std::vector< filesystem::path > > GetLocations() const
Retrieve the list of all samples locations in the dataset file.
Split()=default
optional< filesystem::path > label_path_
Absolute path of sample ground truth.
Definition: any.h:69
int current_split_
Current split from which images are loaded.
void SetSplit(const ecvl::any &split)
Set the current split.
std::vector< filesystem::path > location_
Absolute path of the sample.
Task task_
Task of the dataset.
ecvl::Image LoadImage(ecvl::ColorType ctype=ecvl::ColorType::RGB, const bool &is_gt=false)
Return an Image of the dataset.
void SetNumBatches(int batch_size)
Sample image in a dataset.
Split of a dataset. This class provides the name of the split and the indices of the samples that belong to it.
std::experimental::optional< T > optional
Definition: optional.h:72
optional< SplitType > split_type_
If the split is training, validation or test the corresponding SplitType is provided.
std::string name_
Name of the Dataset.
std::string split_name_
Name of the split.
std::experimental::any any
Definition: any.h:71
Task
Enum class representing allowed tasks for the ECVL Dataset.
int last_batch_
Dimension of the last batch of this split.
DeepHealth Dataset.
static const std::regex url_regex_
SplitType
Enum class representing the Dataset supported splits.
int num_batches_
Number of batches of this split.
std::vector< Sample > samples_
Vector containing all the Dataset samples. See Sample.
bool no_label_
Whether the split has samples with labels or not.
bool drop_last_
Whether to drop elements that don't fit batch size or not.