Column-oriented GPU-accelerated Database Management System
CoGaDB
/home/sebastian/gpudbms/trunk/cogadb/include/compression/rle_compressed_column.hpp
Go to the documentation of this file.
00001 #pragma once
00002 
00003 #include <core/compressed_column.hpp>
00004 #include <sstream>
00005 #include <string> 
00006 #include <fstream> 
00007 #include <boost/archive/binary_iarchive.hpp>
00008 #include <boost/archive/binary_oarchive.hpp>
00009 #include <boost/serialization/map.hpp>
00010 
00011 namespace CoGaDB{
00012 
00013         template<class T>
00014         class RLECompressedColumn : public CompressedColumn<T>{
00015         public:
00016                 RLECompressedColumn(const std::string& name, AttributeType db_type);
00017                 ~RLECompressedColumn();
00018 
00019                 bool insert(const boost::any& new_Value);
00020                 bool insert(const T& new_value);
00021                 template <typename InputIterator>
00022                 bool insert(InputIterator first, InputIterator last);
00023 
00024                 bool update(TID tid, const boost::any& new_value);
00025                 bool update(PositionListPtr tid, const boost::any& new_value);  
00026 
00027                 bool remove(TID tid);
00028                 //assumes tid list is sorted ascending
00029                 bool remove(PositionListPtr tid);
00030                 bool clearContent();
00031 
00032                 const boost::any get(TID tid);
00033                 void print() const throw();
00034                 size_t size() const throw();
00035                 unsigned int getSizeinBytes() const throw();
00036 
00037                 const ColumnPtr copy() const;
00038 
00039                 bool store(const std::string& path);
00040                 bool load(const std::string& path);
00041 
00042                 T& operator[](const int index);
00043 
00044         private:
00045                 std::vector<T> values_;
00046                 std::vector<unsigned char> count_;
00047                 std::string last_compressed_value;
00048         };
00049 
00050 
00051         /***************** Start of Implementation Section ******************/
00052 
00053 
00054         template<class T>
00055         RLECompressedColumn<T>::RLECompressedColumn(const std::string& name, AttributeType db_type) : CompressedColumn<T>(name, db_type), values_(), count_(), last_compressed_value() {}
00056 
00057         template<class T>
00058         RLECompressedColumn<T>::~RLECompressedColumn() {}
00059 
00060         template<class T>
00061         bool RLECompressedColumn<T>::insert(const boost::any& new_value){
00062                 if(new_value.empty()) return false;
00063                 if(typeid(T)==new_value.type()){
00064                         return insert(boost::any_cast<T>(new_value));
00065                 }
00066                 return false;
00067         }
00068 
00069         template<class T>
00070         bool RLECompressedColumn<T>::insert(const T& new_value){
00071                 //row
00072                 if (!values_.empty() &&
00073                         new_value == values_.back() &&
00074                         count_.back() < 255) {
00075                                 count_[count_.size() - 1] = count_.back() + 1;
00076                 }
00077                 else {
00078                         values_.push_back(new_value);
00079                         count_.push_back(0);
00080                 }
00081 
00082                 return true;
00083         }
00084 
00085         std::string rle_encode_string(const std::string s) {
00086                 int count = 0;
00087                 std::stringstream result_str;
00088                 char found = '\0';
00089                 for (unsigned int i = 0; i < s.length(); i++) {
00090 
00091                         const char c = s[i];
00092 
00093                         //              for(const char c : s) {
00094                         if (i == 0) { //first char
00095                                 found = c;
00096                         }
00097                         else if (found == c) { //following char
00098                                 if (count >= 255) { //if count is bigger than max char write new char
00099                                         result_str << (char) 255 << (char) found;
00100                                         count = count - 255;
00101                                 }
00102                                 else count++;
00103                         }
00104                         else { //new char, wirte last found, reset count
00105                                 result_str << (char) count << (char) found;
00106                                 found = c;
00107                                 count = 0;
00108                         }
00109                 }
00110                 result_str << (char) count << (char) found; //write last char
00111                 return result_str.str();
00112         }
00113 
00114         template<>
00115         bool RLECompressedColumn<std::string>::insert(const std::string& new_value){
00116                 std::string result = rle_encode_string(new_value);
00117                 if ((!values_.empty()) &&
00118                         (result == values_.back()) &&
00119                         (count_.back() < 255)) {
00120                                 count_[count_.size()-1] = count_.back()+1;
00121                 }
00122                 else {
00123                         values_.push_back(result);
00124                         count_.push_back(0);
00125                 }
00126                 return true;
00127         }
00128 
00129 
00130         template <typename T> 
00131         template <typename InputIterator>
00132         bool RLECompressedColumn<T>::insert(InputIterator first, InputIterator last){        
00133                 for (InputIterator it = first; it != last; it++) 
00134                 {
00135                         insert(*it);
00136                 }
00137                 return true;
00138         }
00139 
00140         template<class T>
00141         const boost::any RLECompressedColumn<T>::get(TID tid){
00142                 return boost::any(operator [] (tid));
00143         }
00144 
00145         template<class T>
00146         void RLECompressedColumn<T>::print() const throw(){
00147                 std::cout << "| " << this->name_ << " |" << std::endl;
00148                 std::cout << "________________________" << std::endl;
00149                 for(unsigned int i=0;i<values_.size() && i<count_.size();i++){
00150                         std::cout << "| " << values_[i] << " * " << (int) count_[i] + 1 << " | " << std::endl;
00151                 }
00152         }
00153         template<class T>
00154         size_t RLECompressedColumn<T>::size() const throw(){
00155                 int size_sum = 0;
00156                 for (unsigned int i = 0; i < count_.size(); i++)
00157                 {
00158                         size_sum += count_[i] + 1;
00159                 }
00160                 return size_sum;
00161         }
00162         template<class T>
00163         const ColumnPtr RLECompressedColumn<T>::copy() const{
00164                 return ColumnPtr(new RLECompressedColumn<T>(*this));
00165         }
00166 
00167         template<class T>
00168         bool RLECompressedColumn<T>::update(TID tid, const boost::any& new_value){
00169                 if(new_value.empty()) {
00170                         std::cout << "E: Update: new value is empty" << std::endl;
00171                         return false;
00172                 }
00173                 unsigned int len = size();
00174                 if(len <= tid) {
00175                         return false;
00176                 }
00177                 if(typeid(T)==new_value.type()){
00178                         //cast value to T
00179                         T new_T_value = boost::any_cast<T>(new_value);
00180 
00181                         //decode value
00182                         std::vector<T> decoded_values_;
00183                         for (unsigned int i = 0; i < len; i++)
00184                         {
00185                                 T value = operator[](i);
00186                                 decoded_values_.push_back(value);
00187                         }
00188 
00189                         //update decoded_values_
00190                         decoded_values_[tid] = new_T_value;
00191 
00192                         //update values_ -> clearContent + insert(decoded) -> encoded values_
00193                         clearContent();
00194                         for (unsigned int i = 0; i < decoded_values_.size(); i++) 
00195                         {
00196                                 T v = decoded_values_[i];
00197                                 insert(v);
00198                         }
00199 
00200                         return true;
00201                 }else{
00202                         std::cout << "Fatal Error!!! Typemismatch for column " << this->name_ << std::endl; 
00203                 }
00204                 return false;
00205         }
00206 
00207         template<class T>
00208         bool RLECompressedColumn<T>::update(PositionListPtr tids, const boost::any& new_value){
00209                 if(!tids)
00210                         return false;
00211                 if(new_value.empty()) return false;
00212                 TID tid;
00213                 for(unsigned int i=0;i<tids->size();i++){
00214                         tid=(*tids)[i];
00215                         update(tid, new_value);
00216                 }
00217                 return true;    
00218         }
00219 
00220         template<class T>
00221         bool RLECompressedColumn<T>::remove(TID tid){
00222                 unsigned int len = size();
00223                 if (len <= tid) 
00224                         return false;
00225 
00226                 //decode value
00227                 std::vector<T> decoded_values_;
00228                 for (unsigned int i = 0; i < len; i++)
00229                 {
00230                         T value = operator[](i);
00231                         decoded_values_.push_back(value);
00232                 }
00233 
00234                 //erase value at tid
00235                 decoded_values_.erase(decoded_values_.begin()+tid);
00236 
00237                 //update values_ -> clearContent + insert(decoded) -> encoded values_
00238                 clearContent();
00239                 for (unsigned int i = 0; i < decoded_values_.size(); i++) 
00240                 {
00241                         T v = decoded_values_[i];
00242                         insert(v);
00243                 }
00244 
00245                 return true;
00246         }
00247 
00248         template<class T>
00249         bool RLECompressedColumn<T>::remove(PositionListPtr tids){
00250                 if(!tids)
00251                         return false;
00252                 if(tids->empty())
00253                         return false;           
00254 
00255                 typename PositionList::reverse_iterator rit;
00256 
00257                 for (rit = tids->rbegin(); rit!=tids->rend(); ++rit)
00258                 {
00259                         remove(*rit);
00260                 }
00261 
00262                 return true;            
00263         }
00264 
00265         template<class T>
00266         bool RLECompressedColumn<T>::clearContent(){
00267                 values_.clear();
00268                 count_.clear();
00269                 return true;
00270         }
00271 
00272         template<class T>
00273         bool RLECompressedColumn<T>::store(const std::string& path_){
00274                 std::string path1(path_);
00275                 path1 += "/";
00276                 path1 += this->name_;
00277                 path1 += " values";
00278 
00279                 std::ofstream outfile1 (path1.c_str(), std::ios_base::binary | std::ios_base::out);
00280                 boost::archive::binary_oarchive oa1(outfile1);
00281                 oa1 << values_;
00282                 outfile1.flush();
00283                 outfile1.close();
00284 
00285                 std::string path2(path_);
00286                 path2 += "/";
00287                 path2 += this->name_;
00288                 path2 += " count";
00289 
00290                 std::ofstream outfile2 (path2.c_str(), std::ios_base::binary | std::ios_base::out);
00291                 boost::archive::binary_oarchive oa2(outfile2);
00292                 oa2 << count_;
00293                 outfile2.flush();
00294                 outfile2.close();
00295 
00296                 return true;
00297         }
00298 
00299         template<class T>
00300         bool RLECompressedColumn<T>::load(const std::string& path_){
00301                 std::string path1(path_);
00302 
00303                 path1 += "/";
00304                 path1 += this->name_;
00305                 path1 += " values";
00306 
00307                 std::ifstream infile1 (path1.c_str(),std::ios_base::binary | std::ios_base::in);
00308                 boost::archive::binary_iarchive ia1(infile1);
00309                 ia1 >> values_;
00310                 infile1.close();
00311 
00312                 std::string path2(path_);
00313 
00314                 path2 += "/";
00315                 path2 += this->name_;
00316                 path2 += " count";
00317 
00318                 std::ifstream infile2 (path2.c_str(),std::ios_base::binary | std::ios_base::in);
00319                 boost::archive::binary_iarchive ia2(infile2);
00320                 ia2 >> count_;
00321                 infile2.close();
00322 
00323                 return true;
00324         }
00325 
00326         template<>
00327         std::string& RLECompressedColumn<std::string>::operator[](const int index){
00328                 //find value
00329                 unsigned int row_sum = 0;
00330                 unsigned int i = 0;
00331                 for (; row_sum + count_[i] < index; i++)
00332                 {
00333                         row_sum += ((count_[i]) + 1);
00334                 }
00335                 std::string encoded = values_[i];
00336 
00337                 //decode string
00338                 std::string decoded;
00339                 for (unsigned int j = 0; j < encoded.size(); j+=2)
00340                 {
00341                         unsigned char c = encoded[j];
00342                         for (unsigned int k = 0; k <= c; k++)
00343                         {
00344                                 decoded.push_back(encoded[j+1]);
00345                         }
00346                 }
00347 
00348                 last_compressed_value = decoded;
00349                 return last_compressed_value;
00350         }
00351 
00352         template<class T>
00353         T& RLECompressedColumn<T>::operator[](const int index){
00354                 int row_sum = 0;
00355                 unsigned int i = 0;
00356                 for (; row_sum + count_[i] < index; i++)
00357                 {
00358                         row_sum += ((count_[i]) + 1);
00359                 }
00360                 return values_[i];
00361         }
00362 
00363 
00364         template<class T>
00365         unsigned int RLECompressedColumn<T>::getSizeinBytes() const throw(){
00366                 return (values_.capacity()*sizeof(T)) + (values_.capacity()*sizeof(char));
00367         }
00368 
00369         template<>
00370         inline unsigned int RLECompressedColumn<std::string>::getSizeinBytes() const throw(){
00371                 unsigned int size_in_bytes=0;
00372                 for(unsigned int i=0;i<values_.size();++i){
00373                         size_in_bytes+=values_[i].capacity();
00374                 }
00375                 return size_in_bytes;
00376         }
00377 
00378 };
00379 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines