GenomicDataStream
A scalable interface between data and analysis
Loading...
Searching...
No Matches
bgen_load.h
Go to the documentation of this file.
1
5
6
7#ifndef BGEN_LOAD_H_
8#define BGEN_LOAD_H_
9
10#include <sstream>
11#include <map>
12#include <set>
13#include "genfile/bgen/View.hpp"
14#include "genfile/bgen/IndexQuery.hpp"
15
16using namespace std;
17
18namespace {
19
20 template< typename T >
21 string atoi( const T &value ) {
22 std::ostringstream stream ;
23 stream << value ;
24 return stream.str() ;
25 }
26
27 struct DataSetter {
28 DataSetter(
29 vector<int>* ploidy,
30 vector<int> const& ploidy_dimension,
31 vector<double> * data,
32 vector<int> const& data_dimension,
33 vector<bool>* phased,
34 size_t variant_i,
35 map<size_t, size_t> const& requested_samples
36 ):
37 m_ploidy( ploidy ),
38 m_ploidy_dimension( ploidy_dimension ),
39 m_data( data ),
40 m_data_dimension( data_dimension ),
41 m_phased( phased ),
42 m_variant_i( variant_i ),
43 m_requested_samples( requested_samples ),
44 m_requested_sample_i( m_requested_samples.begin() ),
45 m_storage_i( 0 ),
46 m_order_type( genfile::eUnknownOrderType )
47 {
48 assert( m_data_dimension[0] == m_ploidy_dimension[0] ) ;
49 assert( m_data_dimension[1] == m_ploidy_dimension[1] ) ;
50 assert( m_data_dimension[1] == m_requested_samples.size() ) ;
51 assert( m_variant_i < m_data_dimension[0] ) ;
52 assert( m_data_dimension[2] >= 3 ) ;
53 assert( m_phased->size() == m_data_dimension[0] ) ;
54 }
55
56 // Called once allowing us to set storage.
57 void initialise( size_t number_of_samples, size_t number_of_alleles ) {
58 }
59
60 // If present with this signature, called once after initialise()
61 // to set the minimum and maximum ploidy and numbers of probabilities among samples in the data.
62 // This enables us to set up storage for the data ahead of time.
63 void set_min_max_ploidy(
64 genfile::bgen::uint32_t min_ploidy, genfile::bgen::uint32_t max_ploidy,
65 genfile::bgen::uint32_t min_entries, genfile::bgen::uint32_t max_entries
66 ) {
67 if( max_entries > m_data_dimension[2] ) {
68 throw std::invalid_argument( "max_entries=" + atoi( max_entries )
69 + " (expected at most " + atoi( m_data_dimension[2] ) + ")" ) ;
70 }
71 }
72
73 // Called once per sample to determine whether we want data for this sample
74 bool set_sample( size_t i ) {
75 if( m_requested_sample_i != m_requested_samples.end() && m_requested_sample_i->first == i ) {
76 m_storage_i = m_requested_sample_i->second ;
77 ++m_requested_sample_i ;
78// #if DEBUG
79// std::cerr << "DataSetter::set_sample(): sample " << i << " has storage index " << m_storage_i << ".\n" ;
80// #endif
81 return true ;
82 } else {
83 // Don't want this sample
84 return false ;
85 }
86 }
87
88 // Called once per sample to set the number of probabilities that are present.
89 void set_number_of_entries(
90 size_t ploidy,
91 size_t number_of_entries,
92 genfile::OrderType order_type,
93 genfile::ValueType value_type
94 ) {
95 if( value_type != genfile::eProbability ) {
96 throw std::invalid_argument(
97 "value_type ("
98 + atoi( value_type ) + ", expected "
99 + atoi( genfile::eProbability ) + "=genfile::eProbability)"
100 ) ;
101 }
102 if( m_order_type == genfile::eUnknownOrderType ) {
103 m_order_type = order_type ;
104 // (*m_phased)( m_variant_i ) = ( m_order_type == genfile::ePerPhasedHaplotypePerAllele ) ;
105 (*m_phased)[ m_variant_i ] = ( m_order_type == genfile::ePerPhasedHaplotypePerAllele ) ;
106 } else {
107 assert( order_type == m_order_type ) ;
108 }
109 int const index = m_variant_i + m_storage_i * m_ploidy_dimension[0] ;
110 (*m_ploidy)[ index ] = ploidy ;
111 }
112
113 void set_value( genfile::bgen::uint32_t entry_i, double value ) {
114
115 int const index = m_variant_i + m_storage_i * m_data_dimension[0] + entry_i * m_data_dimension[0] * m_data_dimension[1] ;
116// #if DEBUG
117// std::cerr << "Setting data for index " << m_variant_i << ", " << m_storage_i << ", " << entry_i << ": index " << index << "...\n" << std::flush ;
118// #endif
119 (*m_data)[ index ] = value ;
120 }
121
122 void set_value( genfile::bgen::uint32_t entry_i, genfile::MissingValue value ) {
123 int const index = m_variant_i + m_storage_i * m_data_dimension[0] + entry_i * m_data_dimension[0] * m_data_dimension[1] ;
124// #if DEBUG
125// std::cerr << "Setting data for index " << m_variant_i << ", " << m_storage_i << ", " << entry_i << ": index " << index << "...\n" << std::flush ;
126// #endif
127 (*m_data)[ index ] = numeric_limits<double>::quiet_NaN() ;
128 }
129
130 void finalise() {
131 // nothing to do
132 }
133
134 private:
135 vector<int>* m_ploidy ;
136 vector<int> const m_ploidy_dimension ;
137 vector<double> * m_data ;
138 vector<int> const m_data_dimension ;
139 vector<bool> * m_phased ;
140
141 size_t const m_variant_i ;
142
143 map<size_t, size_t> const& m_requested_samples ;
144 map<size_t, size_t>::const_iterator m_requested_sample_i ;
145 size_t m_storage_i ;
146
147 genfile::OrderType m_order_type ;
148 } ;
149
150 struct set_sample_names {
151 typedef map< size_t, size_t > SampleIndexMap ;
152
153 set_sample_names( vector<string>* result, SampleIndexMap* sample_indices ):
154 m_result( result ),
155 m_sample_indices( sample_indices ),
156 m_index(0)
157 {
158 assert( result != 0 ) ;
159 assert( sample_indices != 0 ) ;
160 // assert( sample_indices->size() == result->size() ) ;
161 }
162
163 void operator()( string const& value ) {
164 m_sample_indices->insert( std::make_pair( m_index, m_index ) ) ;
165 (*m_result)[m_index++] = value ;
166 }
167 private:
168 vector<string>* m_result ;
169 SampleIndexMap* m_sample_indices ;
170 size_t m_index ;
171 } ;
172
173 struct set_requested_sample_names {
174 typedef map<string, size_t> RequestedSamples ;
175 typedef map<size_t, size_t> SampleIndexMap ;
176
177 set_requested_sample_names(
178 vector<string>* result,
179 SampleIndexMap* sample_indices,
180 RequestedSamples const& requested_samples
181 ):
182 m_result( result ),
183 m_sample_indices( sample_indices ),
184 m_requested_samples( requested_samples ),
185 m_index(0),
186 m_value_index(0)
187 {
188 assert( result != 0 ) ;
189 assert( sample_indices != 0 ) ;
190 assert( sample_indices->size() == 0 ) ;
191 assert( result->size() == requested_samples.size() ) ;
192 }
193
194 void operator()(string const& value ) {
195 RequestedSamples::const_iterator where = m_requested_samples.find( value ) ;
196 if( where != m_requested_samples.end() ) {
197 (*m_result)[ where->second ] = value ;
198 m_sample_indices->insert( make_pair( m_value_index, where->second ) ) ;
199 }
200 ++m_value_index ;
201 }
202 private:
203 vector<string>* m_result ;
204 SampleIndexMap* m_sample_indices ;
205 RequestedSamples const& m_requested_samples ;
206 size_t m_index ;
207 size_t m_value_index ;
208 } ;
209}
210
211static void get_all_samples(
212 genfile::bgen::View const& view,
213 size_t* number_of_samples,
214 vector<string>* sampleNames,
215 map<size_t, size_t>* requestedSamplesByIndexInDataIndex
216) {
217 *number_of_samples = view.number_of_samples() ;
218 sampleNames->resize( *number_of_samples ) ;
219 view.get_sample_ids( set_sample_names( sampleNames, requestedSamplesByIndexInDataIndex ) ) ;
220}
221
222static void get_requested_samples(
223 genfile::bgen::View const& view,
224 vector<string> const& requestedSamples,
225 size_t* number_of_samples,
226 vector<string>* sampleNames,
227 map<size_t, size_t>* requestedSamplesByIndexInDataIndex
228) {
229 // convert requested sample IDs to a map of requested indices.
230 map<string, size_t> requestedSamplesByName ;
231 for( size_t i = 0; i < requestedSamples.size(); ++i ) {
232 requestedSamplesByName.insert( std::map< string, size_t >::value_type( string( requestedSamples[i] ), i )) ;
233 }
234 if( requestedSamplesByName.size() != requestedSamples.size() ) {
235 throw std::invalid_argument(
236 "load_unsafe(): requiredSamples: expected a list of unique samples with no repeats."
237 ) ;
238 }
239
240 *number_of_samples = requestedSamples.size() ;
241 sampleNames->resize( requestedSamples.size() ) ;
242 view.get_sample_ids( set_requested_sample_names( sampleNames, requestedSamplesByIndexInDataIndex, requestedSamplesByName ) ) ;
243
244 // Check each requested sample has been asked for exactly once
245 // We count distinct samples, among those requested, that we've found in the data
246 // And we also count the min and max index of those samples.
247 // If min = 0 and max = (#requested samples-1) and each sample was unique, we're ok.
248 set<size_t> checkSamples ;
249 size_t minIndex = std::numeric_limits< size_t >::max() ;
250 size_t maxIndex = 0 ;
251
252 for(
253 map<size_t, size_t>::const_iterator p = requestedSamplesByIndexInDataIndex->begin();
254 p != requestedSamplesByIndexInDataIndex->end();
255 ++p
256 ) {
257 checkSamples.insert( p->second ) ;
258 minIndex = std::min( minIndex, p->second ) ;
259 maxIndex = std::max( maxIndex, p->second ) ;
260 }
261 if( checkSamples.size() != requestedSamples.size() || minIndex != 0 || maxIndex != (requestedSamples.size()-1) ) {
262 // Huh. To be most useful, let's print diagnostics
263 // std::cerr << "!! Uh-oh: requested sample indices (data, request) are:\n" ;
264 for(
265 map<size_t, size_t>::const_iterator p = requestedSamplesByIndexInDataIndex->begin();
266 p != requestedSamplesByIndexInDataIndex->end();
267 ++p
268 ) {
269 // std::cerr << p->first << ", " << p->second << ".\n" ;
270 }
271
272 throw std::invalid_argument(
273 // "load_unsafe(): requiredSamples contains a sample not present in the data, or data contains a repeated sample ID."
274 "requested sample not found in file"
275 ) ;
276 }
277}
278
279#endif