Column-oriented GPU-accelerated Database Management System
CoGaDB
|
00001 #pragma once 00002 00003 #include <core/compressed_column.hpp> 00004 #include <sstream> 00005 #include <string> 00006 #include <fstream> 00007 #include <boost/archive/binary_iarchive.hpp> 00008 #include <boost/archive/binary_oarchive.hpp> 00009 #include <boost/serialization/map.hpp> 00010 00011 namespace CoGaDB{ 00012 00013 template<class T> 00014 class RLECompressedColumn : public CompressedColumn<T>{ 00015 public: 00016 RLECompressedColumn(const std::string& name, AttributeType db_type); 00017 ~RLECompressedColumn(); 00018 00019 bool insert(const boost::any& new_Value); 00020 bool insert(const T& new_value); 00021 template <typename InputIterator> 00022 bool insert(InputIterator first, InputIterator last); 00023 00024 bool update(TID tid, const boost::any& new_value); 00025 bool update(PositionListPtr tid, const boost::any& new_value); 00026 00027 bool remove(TID tid); 00028 //assumes tid list is sorted ascending 00029 bool remove(PositionListPtr tid); 00030 bool clearContent(); 00031 00032 const boost::any get(TID tid); 00033 void print() const throw(); 00034 size_t size() const throw(); 00035 unsigned int getSizeinBytes() const throw(); 00036 00037 const ColumnPtr copy() const; 00038 00039 bool store(const std::string& path); 00040 bool load(const std::string& path); 00041 00042 T& operator[](const int index); 00043 00044 private: 00045 std::vector<T> values_; 00046 std::vector<unsigned char> count_; 00047 std::string last_compressed_value; 00048 }; 00049 00050 00051 /***************** Start of Implementation Section ******************/ 00052 00053 00054 template<class T> 00055 RLECompressedColumn<T>::RLECompressedColumn(const std::string& name, AttributeType db_type) : CompressedColumn<T>(name, db_type), values_(), count_(), last_compressed_value() {} 00056 00057 template<class T> 00058 RLECompressedColumn<T>::~RLECompressedColumn() {} 00059 00060 template<class T> 00061 bool RLECompressedColumn<T>::insert(const boost::any& new_value){ 00062 if(new_value.empty()) return false; 00063 if(typeid(T)==new_value.type()){ 00064 return insert(boost::any_cast<T>(new_value)); 00065 } 00066 return false; 00067 } 00068 00069 template<class T> 00070 bool RLECompressedColumn<T>::insert(const T& new_value){ 00071 //row 00072 if (!values_.empty() && 00073 new_value == values_.back() && 00074 count_.back() < 255) { 00075 count_[count_.size() - 1] = count_.back() + 1; 00076 } 00077 else { 00078 values_.push_back(new_value); 00079 count_.push_back(0); 00080 } 00081 00082 return true; 00083 } 00084 00085 std::string rle_encode_string(const std::string s) { 00086 int count = 0; 00087 std::stringstream result_str; 00088 char found = '\0'; 00089 for (unsigned int i = 0; i < s.length(); i++) { 00090 00091 const char c = s[i]; 00092 00093 // for(const char c : s) { 00094 if (i == 0) { //first char 00095 found = c; 00096 } 00097 else if (found == c) { //following char 00098 if (count >= 255) { //if count is bigger than max char write new char 00099 result_str << (char) 255 << (char) found; 00100 count = count - 255; 00101 } 00102 else count++; 00103 } 00104 else { //new char, wirte last found, reset count 00105 result_str << (char) count << (char) found; 00106 found = c; 00107 count = 0; 00108 } 00109 } 00110 result_str << (char) count << (char) found; //write last char 00111 return result_str.str(); 00112 } 00113 00114 template<> 00115 bool RLECompressedColumn<std::string>::insert(const std::string& new_value){ 00116 std::string result = rle_encode_string(new_value); 00117 if ((!values_.empty()) && 00118 (result == values_.back()) && 00119 (count_.back() < 255)) { 00120 count_[count_.size()-1] = count_.back()+1; 00121 } 00122 else { 00123 values_.push_back(result); 00124 count_.push_back(0); 00125 } 00126 return true; 00127 } 00128 00129 00130 template <typename T> 00131 template <typename InputIterator> 00132 bool RLECompressedColumn<T>::insert(InputIterator first, InputIterator last){ 00133 for (InputIterator it = first; it != last; it++) 00134 { 00135 insert(*it); 00136 } 00137 return true; 00138 } 00139 00140 template<class T> 00141 const boost::any RLECompressedColumn<T>::get(TID tid){ 00142 return boost::any(operator [] (tid)); 00143 } 00144 00145 template<class T> 00146 void RLECompressedColumn<T>::print() const throw(){ 00147 std::cout << "| " << this->name_ << " |" << std::endl; 00148 std::cout << "________________________" << std::endl; 00149 for(unsigned int i=0;i<values_.size() && i<count_.size();i++){ 00150 std::cout << "| " << values_[i] << " * " << (int) count_[i] + 1 << " | " << std::endl; 00151 } 00152 } 00153 template<class T> 00154 size_t RLECompressedColumn<T>::size() const throw(){ 00155 int size_sum = 0; 00156 for (unsigned int i = 0; i < count_.size(); i++) 00157 { 00158 size_sum += count_[i] + 1; 00159 } 00160 return size_sum; 00161 } 00162 template<class T> 00163 const ColumnPtr RLECompressedColumn<T>::copy() const{ 00164 return ColumnPtr(new RLECompressedColumn<T>(*this)); 00165 } 00166 00167 template<class T> 00168 bool RLECompressedColumn<T>::update(TID tid, const boost::any& new_value){ 00169 if(new_value.empty()) { 00170 std::cout << "E: Update: new value is empty" << std::endl; 00171 return false; 00172 } 00173 unsigned int len = size(); 00174 if(len <= tid) { 00175 return false; 00176 } 00177 if(typeid(T)==new_value.type()){ 00178 //cast value to T 00179 T new_T_value = boost::any_cast<T>(new_value); 00180 00181 //decode value 00182 std::vector<T> decoded_values_; 00183 for (unsigned int i = 0; i < len; i++) 00184 { 00185 T value = operator[](i); 00186 decoded_values_.push_back(value); 00187 } 00188 00189 //update decoded_values_ 00190 decoded_values_[tid] = new_T_value; 00191 00192 //update values_ -> clearContent + insert(decoded) -> encoded values_ 00193 clearContent(); 00194 for (unsigned int i = 0; i < decoded_values_.size(); i++) 00195 { 00196 T v = decoded_values_[i]; 00197 insert(v); 00198 } 00199 00200 return true; 00201 }else{ 00202 std::cout << "Fatal Error!!! Typemismatch for column " << this->name_ << std::endl; 00203 } 00204 return false; 00205 } 00206 00207 template<class T> 00208 bool RLECompressedColumn<T>::update(PositionListPtr tids, const boost::any& new_value){ 00209 if(!tids) 00210 return false; 00211 if(new_value.empty()) return false; 00212 TID tid; 00213 for(unsigned int i=0;i<tids->size();i++){ 00214 tid=(*tids)[i]; 00215 update(tid, new_value); 00216 } 00217 return true; 00218 } 00219 00220 template<class T> 00221 bool RLECompressedColumn<T>::remove(TID tid){ 00222 unsigned int len = size(); 00223 if (len <= tid) 00224 return false; 00225 00226 //decode value 00227 std::vector<T> decoded_values_; 00228 for (unsigned int i = 0; i < len; i++) 00229 { 00230 T value = operator[](i); 00231 decoded_values_.push_back(value); 00232 } 00233 00234 //erase value at tid 00235 decoded_values_.erase(decoded_values_.begin()+tid); 00236 00237 //update values_ -> clearContent + insert(decoded) -> encoded values_ 00238 clearContent(); 00239 for (unsigned int i = 0; i < decoded_values_.size(); i++) 00240 { 00241 T v = decoded_values_[i]; 00242 insert(v); 00243 } 00244 00245 return true; 00246 } 00247 00248 template<class T> 00249 bool RLECompressedColumn<T>::remove(PositionListPtr tids){ 00250 if(!tids) 00251 return false; 00252 if(tids->empty()) 00253 return false; 00254 00255 typename PositionList::reverse_iterator rit; 00256 00257 for (rit = tids->rbegin(); rit!=tids->rend(); ++rit) 00258 { 00259 remove(*rit); 00260 } 00261 00262 return true; 00263 } 00264 00265 template<class T> 00266 bool RLECompressedColumn<T>::clearContent(){ 00267 values_.clear(); 00268 count_.clear(); 00269 return true; 00270 } 00271 00272 template<class T> 00273 bool RLECompressedColumn<T>::store(const std::string& path_){ 00274 std::string path1(path_); 00275 path1 += "/"; 00276 path1 += this->name_; 00277 path1 += " values"; 00278 00279 std::ofstream outfile1 (path1.c_str(), std::ios_base::binary | std::ios_base::out); 00280 boost::archive::binary_oarchive oa1(outfile1); 00281 oa1 << values_; 00282 outfile1.flush(); 00283 outfile1.close(); 00284 00285 std::string path2(path_); 00286 path2 += "/"; 00287 path2 += this->name_; 00288 path2 += " count"; 00289 00290 std::ofstream outfile2 (path2.c_str(), std::ios_base::binary | std::ios_base::out); 00291 boost::archive::binary_oarchive oa2(outfile2); 00292 oa2 << count_; 00293 outfile2.flush(); 00294 outfile2.close(); 00295 00296 return true; 00297 } 00298 00299 template<class T> 00300 bool RLECompressedColumn<T>::load(const std::string& path_){ 00301 std::string path1(path_); 00302 00303 path1 += "/"; 00304 path1 += this->name_; 00305 path1 += " values"; 00306 00307 std::ifstream infile1 (path1.c_str(),std::ios_base::binary | std::ios_base::in); 00308 boost::archive::binary_iarchive ia1(infile1); 00309 ia1 >> values_; 00310 infile1.close(); 00311 00312 std::string path2(path_); 00313 00314 path2 += "/"; 00315 path2 += this->name_; 00316 path2 += " count"; 00317 00318 std::ifstream infile2 (path2.c_str(),std::ios_base::binary | std::ios_base::in); 00319 boost::archive::binary_iarchive ia2(infile2); 00320 ia2 >> count_; 00321 infile2.close(); 00322 00323 return true; 00324 } 00325 00326 template<> 00327 std::string& RLECompressedColumn<std::string>::operator[](const int index){ 00328 //find value 00329 unsigned int row_sum = 0; 00330 unsigned int i = 0; 00331 for (; row_sum + count_[i] < index; i++) 00332 { 00333 row_sum += ((count_[i]) + 1); 00334 } 00335 std::string encoded = values_[i]; 00336 00337 //decode string 00338 std::string decoded; 00339 for (unsigned int j = 0; j < encoded.size(); j+=2) 00340 { 00341 unsigned char c = encoded[j]; 00342 for (unsigned int k = 0; k <= c; k++) 00343 { 00344 decoded.push_back(encoded[j+1]); 00345 } 00346 } 00347 00348 last_compressed_value = decoded; 00349 return last_compressed_value; 00350 } 00351 00352 template<class T> 00353 T& RLECompressedColumn<T>::operator[](const int index){ 00354 int row_sum = 0; 00355 unsigned int i = 0; 00356 for (; row_sum + count_[i] < index; i++) 00357 { 00358 row_sum += ((count_[i]) + 1); 00359 } 00360 return values_[i]; 00361 } 00362 00363 00364 template<class T> 00365 unsigned int RLECompressedColumn<T>::getSizeinBytes() const throw(){ 00366 return (values_.capacity()*sizeof(T)) + (values_.capacity()*sizeof(char)); 00367 } 00368 00369 template<> 00370 inline unsigned int RLECompressedColumn<std::string>::getSizeinBytes() const throw(){ 00371 unsigned int size_in_bytes=0; 00372 for(unsigned int i=0;i<values_.size();++i){ 00373 size_in_bytes+=values_[i].capacity(); 00374 } 00375 return size_in_bytes; 00376 } 00377 00378 }; 00379