From f3a58a49943dcd8c2388e95a37729de5bd1220c5 Mon Sep 17 00:00:00 2001 From: Marek Otahal Date: Fri, 20 Sep 2019 12:15:44 +0200 Subject: [PATCH] Classifier: use map to allow sparse categories use an unordered_map internally for the per-bit weights_ (keyed by category), plus a list of the categories seen so far, instead of dense vectors. This allows us to have sparse {1,2,999} categories (=3 total). Instead, with a dense vector the weights would have to cover 1000 category slots (indices 0..999)! --- src/htm/algorithms/SDRClassifier.cpp | 32 +++++++++++++++------------- src/htm/algorithms/SDRClassifier.hpp | 8 +++---- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/htm/algorithms/SDRClassifier.cpp b/src/htm/algorithms/SDRClassifier.cpp index d731e1dccf..bb76289e2f 100644 --- a/src/htm/algorithms/SDRClassifier.cpp +++ b/src/htm/algorithms/SDRClassifier.cpp @@ -39,7 +39,7 @@ void Classifier::initialize(const Real alpha) NTA_CHECK(alpha > 0.0f); alpha_ = alpha; dimensions_ = 0; - numCategories_ = 0u; + categories_.clear(); weights_.clear(); } @@ -47,15 +47,16 @@ void Classifier::initialize(const Real alpha) PDF Classifier::infer(const SDR & pattern) const { // Check input dimensions, or if this is the first time the Classifier is used and dimensions // are unset, return zeroes. - NTA_CHECK( dimensions_ != 0 ) + NTA_CHECK( not categories_.empty() ) << "Classifier: must call `learn` before `infer`."; NTA_ASSERT(pattern.size == dimensions_) << "Input SDR does not match previously seen size!"; // Accumulate feed forward input. 
- PDF probabilities( numCategories_, 0.0f ); + PDF probabilities( categories_.size(), 0.0f ); for( const auto bit : pattern.getSparse() ) { - for( size_t i = 0; i < numCategories_; i++ ) { - probabilities[i] += weights_[bit][i]; + for( size_t i=0u; i< categories_.size(); i++) { + const auto category = categories_.at(i); + probabilities[i] += weights_.at(bit).at(category); // needs .at() instead of [] because of the infer() const } } @@ -72,19 +73,20 @@ void Classifier::learn(const SDR &pattern, const vector &categoryIdxList) if( dimensions_ == 0 ) { dimensions_ = pattern.size; while( weights_.size() < pattern.size ) { - const auto initialEmptyWeights = PDF( numCategories_, 0.0f ); + std::unordered_map initialEmptyWeights; weights_.push_back( initialEmptyWeights ); } } NTA_ASSERT(pattern.size == dimensions_) << "Input SDR does not match previously seen size!"; // Check if this is a new category & resize the weights table to hold it. - const size_t maxCategoryIdx = *max_element(categoryIdxList.cbegin(), categoryIdxList.cend()); - if( maxCategoryIdx >= numCategories_ ) { - numCategories_ = maxCategoryIdx + 1; - for( auto & vec : weights_ ) { - while( vec.size() < numCategories_ ) { - vec.push_back( 0.0f ); + for (const auto cat: categoryIdxList) { + const bool alreadyInCategories = std::find(categories_.cbegin(), categories_.cend(), cat) != categories_.cend(); + if( not alreadyInCategories ) { + categories_.push_back(cat); + //update existing inner weights: set new cat's weight to zero + for( auto & mapp : weights_ ) { + mapp.insert({cat, 0.0f}); } } } @@ -92,8 +94,8 @@ void Classifier::learn(const SDR &pattern, const vector &categoryIdxList) // Compute errors and update weights. 
const auto& error = calculateError_(categoryIdxList, pattern); for( const auto& bit : pattern.getSparse() ) { - for(size_t i = 0u; i < numCategories_; i++) { - weights_[bit][i] += alpha_ * error[i]; + for(const auto cat: categories_) { + weights_[bit][cat] += alpha_ * error[cat]; } } } @@ -106,7 +108,7 @@ std::vector Classifier::calculateError_(const std::vector &categor auto likelihoods = infer(pattern); // Compute target likelihoods - PDF targetDistribution(numCategories_ + 1u, 0.0f); + PDF targetDistribution(categories_.size() + 1u, 0.0f); for( size_t i = 0u; i < categoryIdxList.size(); i++ ) { targetDistribution[categoryIdxList[i]] = 1.0f / categoryIdxList.size(); } diff --git a/src/htm/algorithms/SDRClassifier.hpp b/src/htm/algorithms/SDRClassifier.hpp index 99358d2d2d..bf566e081b 100644 --- a/src/htm/algorithms/SDRClassifier.hpp +++ b/src/htm/algorithms/SDRClassifier.hpp @@ -146,25 +146,25 @@ class Classifier : public Serializable { ar(cereal::make_nvp("alpha", alpha_), cereal::make_nvp("dimensions", dimensions_), - cereal::make_nvp("numCategories", numCategories_), + cereal::make_nvp("categories", categories_), cereal::make_nvp("weights", weights_)); } template void load_ar(Archive & ar) - { ar( alpha_, dimensions_, numCategories_, weights_ ); } + { ar( alpha_, dimensions_, categories_, weights_ ); } private: Real alpha_; UInt dimensions_; - size_t numCategories_; + std::vector categories_; /** * 2D map used to store the data. * Use as: weights_[ input-bit ][ category-index ] * Real64 (not just Real) so the computations do not lose precision. */ - std::vector> weights_; + std::vector> weights_; // Helper function to compute the error signal for learning. std::vector calculateError_(const std::vector &bucketIdxList,