7 #ifndef XGBOOST_COMMON_HIST_UTIL_H_ 8 #define XGBOOST_COMMON_HIST_UTIL_H_ 21 #include "../tree/param.h" 24 #include "../include/rabit/rabit.h" 64 *
this = std::forward<HistogramCuts&&>(that);
78 monitor_ = std::move(that.monitor_);
79 cut_ptrs_ = std::move(that.cut_ptrs_);
80 cut_values_ = std::move(that.cut_values_);
81 min_vals_ = std::move(that.min_vals_);
108 auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value);
109 BinIdx idx = it - values.cbegin();
131 static bool UseGroup(
DMatrix* dmat);
132 static bool UseGroup(
MetaInfo const& info);
142 size_t const base_rowid) {
143 CHECK_LT(base_rowid, group_ptr.back())
144 <<
"Row: " << base_rowid <<
" is not found in any group.";
146 std::upper_bound(group_ptr.cbegin(), group_ptr.cend() - 1, base_rowid);
147 bst_group_t group_ind = it - group_ptr.cbegin() - 1;
151 void AddCutPoint(WQSketch::SummaryContainer
const& summary,
int max_bin) {
152 size_t required_cuts = std::min(summary.size, static_cast<size_t>(max_bin));
153 for (
size_t i = 1; i < required_cuts; ++i) {
162 virtual void Build(
DMatrix* dmat, uint32_t
const max_num_bins) = 0;
168 static std::vector<size_t> LoadBalance(
SparsePage const& page,
size_t const nthreads);
174 monitor_.
Init(__FUNCTION__);
178 void Concat(std::vector<std::unique_ptr<SparseCuts>>
const& cuts, uint32_t n_cols);
181 uint32_t max_num_bins,
182 bool const use_group_ind,
183 uint32_t beg, uint32_t end, uint32_t thread_id);
184 void Build(
DMatrix* dmat, uint32_t
const max_num_bins)
override;
195 monitor_.
Init(__FUNCTION__);
197 void Init(std::vector<WQSketch>* sketchs, uint32_t max_num_bins,
size_t max_rows);
198 void Build(
DMatrix* p_fmat, uint32_t max_num_bins)
override;
210 SetBinTypeSize(binTypeSize_);
217 if (offset_ptr_ !=
nullptr) {
218 return func_(data_ptr_, i) + offset_ptr_[i%p_];
220 return func_(data_ptr_, i);
224 binTypeSize_ = binTypeSize;
225 switch (binTypeSize) {
227 func_ = &GetValueFromUint8;
230 func_ = &GetValueFromUint16;
233 func_ = &GetValueFromUint32;
246 return static_cast<T*
>(data_ptr_);
252 return offset_.size();
255 return data_.size() / (binTypeSize_);
258 data_.resize(nBytesData);
259 data_ptr_ =
reinterpret_cast<void*
>(data_.data());
262 offset_.resize(nDisps);
263 offset_ptr_ = offset_.data();
266 std::vector<uint8_t>::const_iterator
begin()
const {
267 return data_.begin();
269 std::vector<uint8_t>::const_iterator
end()
const {
274 static uint32_t GetValueFromUint8(
void *t,
size_t i) {
275 return reinterpret_cast<uint8_t*
>(t)[i];
277 static uint32_t GetValueFromUint16(
void* t,
size_t i) {
278 return reinterpret_cast<uint16_t*
>(t)[i];
280 static uint32_t GetValueFromUint32(
void* t,
size_t i) {
281 return reinterpret_cast<uint32_t*
>(t)[i];
284 using Func = uint32_t (*)(
void*, size_t);
286 std::vector<uint8_t> data_;
287 std::vector<uint32_t> offset_;
291 uint32_t* offset_ptr_ {
nullptr};
314 void Init(
DMatrix* p_fmat,
int max_num_bins);
316 template<
typename BinIdxType>
318 size_t batch_threads,
const SparsePage& batch,
324 size_t batch_threads,
const SparsePage& batch,
325 size_t rbegin,
size_t nbins);
327 void ResizeIndex(
const size_t rbegin,
const SparsePage& batch,
328 const size_t n_offsets,
const size_t n_index,
332 auto nfeature = cut.
Ptrs().size() - 1;
333 for (
unsigned fid = 0; fid < nfeature; ++fid) {
334 auto ibegin = cut.
Ptrs()[fid];
335 auto iend = cut.
Ptrs()[fid + 1];
336 for (
auto i = ibegin; i < iend; ++i) {
337 counts[fid] += hit_count[i];
346 std::vector<size_t> hit_count_tloc_;
355 : row_ptr(row_ptr), index(index) {}
359 return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
369 const tree::TrainParam& param);
372 return {blocks_[i].row_ptr_begin, blocks_[i].index_begin};
376 return blocks_.size();
380 std::vector<size_t> row_ptr_;
381 std::vector<uint32_t> index_;
384 const size_t* row_ptr_begin;
385 const size_t* row_ptr_end;
386 const uint32_t* index_begin;
387 const uint32_t* index_end;
389 std::vector<Block> blocks_;
392 template<
typename GradientSumT>
398 template<
typename GradientSumT>
404 template<
typename GradientSumT>
406 size_t begin,
size_t end);
411 template<
typename GradientSumT>
413 size_t begin,
size_t end);
418 template<
typename GradientSumT>
421 size_t begin,
size_t end);
426 template<
typename GradientSumT>
434 constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
435 CHECK_NE(row_ptr_[nid], kMax);
437 const_cast<GradientPairT*
>(dmlc::BeginPtr(data_) + row_ptr_[nid]);
438 return {ptr, nbins_};
443 const uint32_t k_max = std::numeric_limits<uint32_t>::max();
444 return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
449 if (nbins_ != nbins) {
460 constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
461 if (nid >= row_ptr_.size()) {
462 row_ptr_.resize(nid + 1, kMax);
464 CHECK_EQ(row_ptr_[nid], kMax);
466 if (data_.size() < nbins_ * (nid + 1)) {
467 data_.resize(nbins_ * (nid + 1));
470 row_ptr_[nid] = nbins_ * n_nodes_added_;
478 uint32_t n_nodes_added_ = 0;
480 std::vector<GradientPairT> data_;
483 std::vector<size_t> row_ptr_;
491 template<
typename GradientSumT>
497 if (nbins != nbins_) {
498 hist_buffer_.Init(nbins);
506 const std::vector<GHistRowT>& targeted_hists) {
507 hist_buffer_.Init(nbins_);
508 tid_nid_to_hist_.clear();
509 hist_memory_.clear();
510 threads_to_nids_map_.clear();
512 targeted_hists_ = targeted_hists;
514 CHECK_EQ(nodes, targeted_hists.size());
517 nthreads_ = nthreads;
519 MatchThreadsToNodes(space);
520 AllocateAdditionalHistograms();
521 MatchNodeNidPairToHist();
523 hist_was_used_.resize(nthreads * nodes_);
524 std::fill(hist_was_used_.begin(), hist_was_used_.end(),
static_cast<int>(
false));
529 CHECK_LT(nid, nodes_);
530 CHECK_LT(tid, nthreads_);
532 size_t idx = tid_nid_to_hist_.at({tid, nid});
535 if (!hist_was_used_[tid * nodes_ + nid]) {
537 hist_was_used_[tid * nodes_ + nid] =
static_cast<int>(
true);
545 CHECK_GT(end, begin);
546 CHECK_LT(nid, nodes_);
550 bool is_updated =
false;
551 for (
size_t tid = 0; tid < nthreads_; ++tid) {
552 if (hist_was_used_[tid * nodes_ + nid]) {
554 const size_t idx = tid_nid_to_hist_.at({tid, nid});
571 const size_t space_size = space.
Size();
572 const size_t chunck_size = space_size / nthreads_ + !!(space_size % nthreads_);
574 threads_to_nids_map_.resize(nthreads_ * nodes_,
false);
576 for (
size_t tid = 0; tid < nthreads_; ++tid) {
577 size_t begin = chunck_size * tid;
578 size_t end = std::min(begin + chunck_size, space_size);
580 if (begin < space_size) {
584 for (
size_t nid = nid_begin; nid <= nid_end; ++nid) {
586 threads_to_nids_map_[tid * nodes_ + nid] =
true;
593 size_t hist_allocated_additionally = 0;
595 for (
size_t nid = 0; nid < nodes_; ++nid) {
596 int nthreads_for_nid = 0;
598 for (
size_t tid = 0; tid < nthreads_; ++tid) {
599 if (threads_to_nids_map_[tid * nodes_ + nid]) {
608 hist_allocated_additionally += std::max<int>(0, nthreads_for_nid - 1);
611 for (
size_t i = 0; i < hist_allocated_additionally; ++i) {
612 hist_buffer_.AddHistRow(i);
617 size_t hist_total = 0;
618 size_t hist_allocated_additionally = 0;
620 for (
size_t nid = 0; nid < nodes_; ++nid) {
621 bool first_hist =
true;
622 for (
size_t tid = 0; tid < nthreads_; ++tid) {
623 if (threads_to_nids_map_[tid * nodes_ + nid]) {
625 hist_memory_.push_back(targeted_hists_[nid]);
628 hist_memory_.push_back(hist_buffer_[hist_allocated_additionally]);
629 hist_allocated_additionally++;
632 tid_nid_to_hist_[{tid, nid}] = hist_total++;
633 CHECK_EQ(hist_total, hist_memory_.size());
642 size_t nthreads_ = 0;
667 template<
typename GradientSumT>
673 GHistBuilder(
size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}
676 void BuildHist(
const std::vector<GradientPair>& gpair,
682 void BuildBlockHist(
const std::vector<GradientPair>& gpair,
697 size_t nthread_ { 0 };
699 uint32_t nbins_ { 0 };
705 #endif // XGBOOST_COMMON_HIST_UTIL_H_ DenseCuts(HistogramCuts *container)
Definition: hist_util.h:193
Definition: hist_util.h:208
void SetBinTypeSize(BinTypeSize binTypeSize)
Definition: hist_util.h:223
Index index
The index data.
Definition: hist_util.h:306
float bst_float
float type, used for storing statistics
Definition: base.h:111
BinIdx SearchBin(float value, uint32_t column_id) const
Definition: hist_util.h:104
XGBOOST_DEVICE constexpr index_type size() const __span_noexcept
Definition: span.h:531
void Copy(const HostDeviceVector< T > &other)
HistCollection< GradientSumT > hist_buffer_
Buffer for additional histograms for Parallel processing.
Definition: hist_util.h:646
size_t GetNumBlock() const
Definition: hist_util.h:375
uint32_t FeatureBins(uint32_t feature) const
Definition: hist_util.h:88
Definition: hist_util.h:350
BinIdx SearchBin(Entry const &e) const
Definition: hist_util.h:116
static uint32_t SearchGroupIndFromRow(std::vector< bst_uint > const &group_ptr, size_t const base_rowid)
Definition: hist_util.h:141
void MatchNodeNidPairToHist()
Definition: hist_util.h:616
std::vector< uint32_t > const & Ptrs() const
Definition: hist_util.h:96
void SubtractionHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > src1, const GHistRow< GradientSumT > src2, size_t begin, size_t end)
Compute Subtraction: dst = src1 - src2 in range [begin, end)
util to compute quantiles
void CopyHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > src, size_t begin, size_t end)
Copy hist from src to dst in range [begin, end)
Definition: hist_util.h:203
The input data structure of xgboost.
T * data() const
Definition: hist_util.h:245
HistogramCuts & operator=(HistogramCuts const &that)
Definition: hist_util.h:67
HistogramCuts cut
The corresponding cuts.
Definition: hist_util.h:310
void Reset(size_t nthreads, size_t nodes, const BlockedSpace2d &space, const std::vector< GHistRowT > &targeted_hists)
Definition: hist_util.h:505
Internal data structure used by XGBoost during training.
Definition: data.h:464
size_t GetFirstDimension(size_t i) const
Definition: threading_utils.h:89
Cut configuration for dense dataset.
Definition: hist_util.h:188
In-memory storage unit of sparse batch, stored in CSR format.
Definition: data.h:245
bool IsDense() const
Definition: hist_util.h:341
HistogramCuts & operator=(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:77
Definition: hist_util.h:127
void Init(uint32_t nbins)
Definition: hist_util.h:448
std::vector< size_t > hit_count
hit count of each index
Definition: hist_util.h:308
span class implementation, based on ISO C++20 span<T>. The interface should be the same.
Definition: span.h:126
Definition: hist_util.h:204
builder for histograms of gradient statistics
Definition: hist_util.h:668
size_t OffsetSize() const
Definition: hist_util.h:251
void AllocateAdditionalHistograms()
Definition: hist_util.h:592
Implementation of gradient statistics pair. Template specialisation may be used to overload different...
Definition: base.h:132
std::vector< float > const & MinValues() const
Definition: hist_util.h:98
HostDeviceVector< bst_float > cut_values_
Definition: hist_util.h:48
GHistIndexBlock operator[](size_t i) const
Definition: hist_util.h:371
Quantile sketch use WQSummary.
Definition: quantile.h:672
XGBOOST_DEVICE constexpr pointer data() const __span_noexcept
Definition: span.h:526
Quick Utility to compute subset of rows.
HistogramCuts(HistogramCuts const &that)
Definition: hist_util.h:54
void Init(std::string label)
Definition: timer.h:82
BinTypeSize GetBinTypeSize() const
Definition: hist_util.h:241
size_t TotalBins() const
Definition: hist_util.h:100
HistogramCuts(HistogramCuts &&that) noexcept(true)
Definition: hist_util.h:63
std::vector< uint8_t >::const_iterator begin() const
Definition: hist_util.h:266
std::vector< uint8_t >::const_iterator end() const
Definition: hist_util.h:269
Cut configuration for sparse dataset.
Definition: hist_util.h:166
const size_t * row_ptr
Definition: hist_util.h:351
size_t Size() const
Definition: threading_utils.h:84
histogram of gradient statistics for multiple nodes
Definition: hist_util.h:427
uint32_t bst_group_t
Type for ranking group index.
Definition: base.h:125
std::vector< bool > threads_to_nids_map_
Flags indexed by tid * nodes_ + nid; an entry is true when thread tid is assigned to process node nid.
Definition: hist_util.h:655
Definition: hist_util.h:365
HistogramCuts * p_cuts_
Definition: hist_util.h:135
void AddHistRow(bst_uint nid)
Definition: hist_util.h:459
common::Monitor monitor_
Definition: hist_util.h:45
std::vector< T > & HostVector()
void MatchThreadsToNodes(const BlockedSpace2d &space)
Definition: hist_util.h:570
void Build(DMatrix *dmat, uint32_t const max_num_bins)
void Resize(const size_t nBytesData)
Definition: hist_util.h:257
GHistRowT operator[](bst_uint nid) const
Definition: hist_util.h:433
uint32_t operator[](size_t i) const
Definition: hist_util.h:216
Index()
Definition: hist_util.h:209
void IncrementHist(GHistRow< GradientSumT > dst, const GHistRow< GradientSumT > add, size_t begin, size_t end)
Increment hist as dst += add in range [begin, end)
GHistIndexRow operator[](size_t i) const
Definition: hist_util.h:358
a collection of columns, with support for construction from GHistIndexMatrix.
Definition: column_matrix.h:101
HostDeviceVector< float > min_vals_
Definition: hist_util.h:51
BinTypeSize
Definition: hist_util.h:202
SparseCuts(HistogramCuts *container)
Definition: hist_util.h:172
GHistBuilder(size_t nthread, uint32_t nbins)
Definition: hist_util.h:673
void Init(size_t nbins)
Definition: hist_util.h:496
uint32_t BinIdx
Definition: hist_util.h:44
namespace of xgboost
Definition: base.h:102
size_t Size() const
Definition: hist_util.h:254
data structure to store an instance set, a subset of rows (instances) associated with a particular no...
Definition: row_set.h:24
uint32_t * Offset() const
Definition: hist_util.h:248
const std::vector< T > & ConstHostVector() const
bool RowExists(bst_uint nid) const
Definition: hist_util.h:442
Definition: threading_utils.h:54
Timing utility used to measure total method execution time over the lifetime of the containing object...
Definition: timer.h:47
size_t max_num_bins
Definition: hist_util.h:312
uint32_t GetNumBins() const
Definition: hist_util.h:691
CutsBuilder(HistogramCuts *p_cuts)
Definition: hist_util.h:138
void AddCutPoint(WQSketch::SummaryContainer const &summary, int max_bin)
Definition: hist_util.h:151
std::vector< size_t > row_ptr
row pointer to rows by element position
Definition: hist_util.h:304
Stores temporary histograms to compute them in parallel. Supports processing multiple tree-nodes for n...
Definition: hist_util.h:492
Element from a sparse vector.
Definition: data.h:201
std::vector< GHistRowT > hist_memory_
Allocated memory for histograms used for construction.
Definition: hist_util.h:659
std::map< std::pair< size_t, size_t >, size_t > tid_nid_to_hist_
map pair {tid, nid} to index of allocated histogram from hist_memory_
Definition: hist_util.h:661
uint32_t bst_uint
unsigned integer type used for feature index.
Definition: base.h:105
Monitor monitor_
Definition: hist_util.h:190
HostDeviceVector< uint32_t > cut_ptrs_
Definition: hist_util.h:49
preprocessed global index matrix, in CSR format
Definition: hist_util.h:302
std::vector< float > const & Values() const
Definition: hist_util.h:97
bst_feature_t index
feature index
Definition: data.h:203
void InitilizeHistByZeroes(GHistRow< GradientSumT > hist, size_t begin, size_t end)
fill a histogram with zeros
std::vector< int > hist_was_used_
Marks which hists were used, meaning that they should be merged. Contains only {true or false} value...
Definition: hist_util.h:652
bst_float fvalue
feature value
Definition: data.h:205
const uint32_t * index
Definition: hist_util.h:352
void GetFeatureCounts(size_t *counts) const
Definition: hist_util.h:331
GHistRowT GetInitializedHist(size_t tid, size_t nid)
Definition: hist_util.h:528
void Resize(size_t new_size, T v=T())
std::vector< GHistRowT > targeted_hists_
Contains histograms for final results.
Definition: hist_util.h:657
void ReduceHist(size_t nid, size_t begin, size_t end)
Definition: hist_util.h:544
void ResizeOffset(const size_t nDisps)
Definition: hist_util.h:261
Definition: hist_util.h:205
GHistIndexBlock(const size_t *row_ptr, const uint32_t *index)
Definition: hist_util.h:354
Definition: hist_util.h:36
DMatrix * p_fmat
Definition: hist_util.h:311