xgboost/src/data/simple_csr_source.cc
其他
2018-12-13 09:31:07
阅读次数: 0
xgboost/src/data/simple_csr_source.cc
121 lines (108 sloc) 3.78 KB
/*!
* Copyright 2015 by Contributors
* \file simple_csr_source.cc
*/
#include <dmlc/base.h>
#include <xgboost/logging.h>
#include <limits>
#include "./simple_csr_source.h"
namespace xgboost {
namespace data {
void SimpleCSRSource::Clear() {
page_.Clear();
this->info.Clear();
}
void SimpleCSRSource::CopyFrom(DMatrix* src) {
this->Clear();
this->info = src->Info();
for (const auto &batch : src->GetRowBatches()) {
page_.Push(batch);
}
}
void SimpleCSRSource::CopyFrom(dmlc::Parser<uint32_t>* parser) {
// use qid to get group info
const uint64_t default_max = std::numeric_limits<uint64_t>::max();
uint64_t last_group_id = default_max;
bst_uint group_size = 0;
this->Clear();
while (parser->Next()) {
const dmlc::RowBlock<uint32_t>& batch = parser->Value();
if (batch.label != nullptr) {
auto& labels = info.labels_.HostVector();
labels.insert(labels.end(), batch.label, batch.label + batch.size);
}
if (batch.weight != nullptr) {
auto& weights = info.weights_.HostVector();
weights.insert(weights.end(), batch.weight, batch.weight + batch.size);
}
if (batch.qid != nullptr) {
info.qids_.insert(info.qids_.end(), batch.qid, batch.qid + batch.size);
// get group
for (size_t i = 0; i < batch.size; ++i) {
const uint64_t cur_group_id = batch.qid[i];
if (last_group_id == default_max || last_group_id != cur_group_id) {
info.group_ptr_.push_back(group_size);
}
last_group_id = cur_group_id;
++group_size;
}
}
// Remove the assertion on batch.index, which can be null in the case that the data in this
// batch is entirely sparse. Although it's true that this indicates a likely issue with the
// user's data workflows, passing XGBoost entirely sparse data should not cause it to fail.
// See https://github.com/dmlc/xgboost/issues/1827 for complete detail.
// CHECK(batch.index != nullptr);
// update information
this->info.num_row_ += batch.size;
// copy the data over
auto& data_vec = page_.data.HostVector();
auto& offset_vec = page_.offset.HostVector();
for (size_t i = batch.offset[0]; i < batch.offset[batch.size]; ++i) {
uint32_t index = batch.index[i];
bst_float fvalue = batch.value == nullptr ? 1.0f : batch.value[i];
data_vec.emplace_back(index, fvalue);
this->info.num_col_ = std::max(this->info.num_col_,
static_cast<uint64_t>(index + 1));
}
size_t top = page_.offset.Size();
for (size_t i = 0; i < batch.size; ++i) {
offset_vec.push_back(offset_vec[top - 1] + batch.offset[i + 1] - batch.offset[0]);
}
}
if (last_group_id != default_max) {
if (group_size > info.group_ptr_.back()) {
info.group_ptr_.push_back(group_size);
}
}
this->info.num_nonzero_ = static_cast<uint64_t>(page_.data.Size());
// Either every row has query ID or none at all
CHECK(info.qids_.empty() || info.qids_.size() == info.num_row_);
}
void SimpleCSRSource::LoadBinary(dmlc::Stream* fi) {
int tmagic;
CHECK(fi->Read(&tmagic, sizeof(tmagic)) == sizeof(tmagic)) << "invalid input file format";
CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
info.LoadBinary(fi);
fi->Read(&page_.offset.HostVector());
fi->Read(&page_.data.HostVector());
}
void SimpleCSRSource::SaveBinary(dmlc::Stream* fo) const {
int tmagic = kMagic;
fo->Write(&tmagic, sizeof(tmagic));
info.SaveBinary(fo);
fo->Write(page_.offset.HostVector());
fo->Write(page_.data.HostVector());
}
void SimpleCSRSource::BeforeFirst() {
at_first_ = true;
}
bool SimpleCSRSource::Next() {
if (!at_first_) return false;
at_first_ = false;
return true;
}
const SparsePage& SimpleCSRSource::Value() const {
return page_;
}
} // namespace data
} // namespace xgboost
转载自blog.csdn.net/lusic01/article/details/84871838