48 genoFileType = getFileType(
param.file);
54 if( genoFileType ==
PGEN ){
57 pvar->Load(fileIdx,
true,
true);
61 missingToMean = (genoFileType ==
PBED) ?
62 true :
param.missingToMean;
69 pg =
new RPgenReader();
70 pg->Load(
param.file, pvar, n_samples_psam, sampleIdx1);
82 if( vInfo !=
nullptr)
delete vInfo;
95 void setRegions(
const vector<string> ®ions )
override {
104 varIdx = vs.getIndeces( gr );
108 for(
int i=0; i<dt.nrows(); i++){
114 n_requested_variants = varIdx.size();
123 return number_of_samples;
129 return vInfo->sampleNames;
135 return toString(
param.fileType);
139 return vs.getChromRanges();
145 bool ret = getNextChunk_helper();
152 arma::mat M(matDosage.data(), number_of_samples, vInfo->size(),
false,
true);
162 bool ret = getNextChunk_helper();
169 arma::mat M(matDosage.data(), number_of_samples, vInfo->size(),
false,
true);
176 #ifndef DISABLE_EIGEN
180 bool ret = getNextChunk_helper();
187 Eigen::MatrixXd M = Eigen::Map<Eigen::MatrixXd>(matDosage.data(), number_of_samples, vInfo->size());
197 bool ret = getNextChunk_helper();
204 Eigen::MatrixXd M = Eigen::Map<Eigen::MatrixXd>(matDosage.data(), number_of_samples, vInfo->size());
216 bool ret = getNextChunk_helper();
223 Rcpp::NumericMatrix M(number_of_samples, vInfo->size(), matDosage.data());
224 colnames(M) = Rcpp::wrap( vInfo->getFeatureNames() );
225 rownames(M) = Rcpp::wrap( vInfo->sampleNames );
236 bool ret = getNextChunk_helper();
249 size_t number_of_samples = 0;
250 int n_requested_variants = 0;
252 vector<double> matDosage;
255 RPgenReader *pg =
nullptr;
256 RPvar *pvar =
nullptr;
261 vector<int> sampleIdx1;
265 bool getNextChunk_helper (){
272 int chunkSize = min(
param.
chunkSize, n_requested_variants - currentIdx);
273 chunkSize = max(chunkSize, 0);
276 if( chunkSize == 0)
return false;
279 auto end = min( varIdx.begin() + currentIdx + chunkSize, varIdx.end());
280 vector<int> varIdx_sub = {varIdx.begin() + currentIdx, end};
285 vector<int> varIdx_sub1(varIdx_sub);
286 for(
int &i : varIdx_sub1) i++;
288 pg->ReadList( matDosage, varIdx_sub1, missingToMean);
293 vInfo->
addVariants( subset_vector(dt[
"CHROM"], varIdx_sub),
294 subset_vector(dt[
"POS"], varIdx_sub),
295 subset_vector(dt[
"ID"], varIdx_sub),
296 subset_vector(dt[
"REF"], varIdx_sub),
297 subset_vector(dt[
"ALT"], varIdx_sub));
300 currentIdx += varIdx_sub.size();
309 void process_variants(){
312 if( genoFileType ==
PGEN ){
314 fileIdx = regex_replace(
param.file, regex(
"pgen$"),
"pvar");
319 dt = DataTable( fileIdx,
"#CHROM" );
322 }
else if( genoFileType ==
PBED ){
325 fileIdx = regex_replace(
param.file, regex(
"bed$"),
"bim");
329 dt = DataTable( fileIdx );
330 dt.setColNames({
"CHROM",
"ID",
"CM",
"POS",
"ALT",
"REF"});
332 throw logic_error(
"Not valid genotype file extension: " +
param.file);
336 vs = VariantSet(dt[
"CHROM"], stoi_vec(dt[
"POS"]));
342 void process_samples(){
350 if(
param.fileSamples.compare(
"") != 0){
351 fileSamples =
param.fileSamples;
353 if( genoFileType ==
PGEN ){
355 fileSamples = regex_replace(
param.file, regex(
"pgen$"),
"psam");
356 }
else if( genoFileType ==
PBED ){
358 fileSamples = regex_replace(
param.file, regex(
"bed$"),
"fam");
363 if( regex_search(fileSamples, regex(
"psam$")) ){
367 dt2 = DataTable(fileSamples,
"#IID");
369 }
else if( regex_search(fileSamples, regex(
"fam$")) ){
373 dt2 = DataTable( fileSamples,
"");
376 vector<string> names = {
"FID",
"IID",
"PID",
"MID",
"SEX",
"PHENO"};
377 vector<string> names_sub(names.begin(), names.begin() + dt.ncols());
378 dt2.setColNames(names_sub);
380 throw logic_error(
"Not valid sample file extension: " + fileSamples);
384 vector<string> SamplesNames = dt2[
"IID"];
385 n_samples_psam = SamplesNames.size();
388 vector<int> sampleIdx;
392 if(
param.samples.compare(
"-") != 0 ){
395 vector<string> requestedSamples;
398 boost::split(requestedSamples,
param.samples, boost::is_any_of(
"\t,\n"));
403 unordered_map<string,int> map_sn;
404 for(
int i=0; i<SamplesNames.size(); i++){
405 map_sn.emplace(SamplesNames[i], i);
410 for(
string & name : requestedSamples){
411 if( map_sn.count(name) == 0){
412 throw logic_error(
"Sample id not found: " + name);
414 sampleIdx.push_back( map_sn[name] );
416 sort(sampleIdx.begin(), sampleIdx.end());
420 vector<string> requestedSamples_ordered;
421 for(
int i : sampleIdx){
422 requestedSamples_ordered.push_back( SamplesNames[i] );
427 vInfo =
new VariantInfo( requestedSamples_ordered );
428 number_of_samples = requestedSamples_ordered.size();
431 sampleIdx1.assign(sampleIdx.begin(), sampleIdx.end());
432 for(
int &i : sampleIdx1) i++;
434 vInfo =
new VariantInfo( SamplesNames );
435 number_of_samples = SamplesNames.size();
438 sampleIdx1.resize(number_of_samples);
439 iota(begin(sampleIdx1), end(sampleIdx1), 1);
void addVariants(const vector< string > &chr, const vector< string > &pos, const vector< string > &id, const vector< string > &allele1, const vector< string > &allele2)
Definition VariantInfo.h:53