% Simulate and reconstruct a genotype vector in a specific setting.
% The simulation is done on one SNP
%
% ============ Required inputs ==================
% n_samples - number of individual to be screened.
%
% k_pools = number of pools we use.
%
% s = number of carriers within n_samples.
%
% mean_reads = mean number of reads per SNP in a lane
%
% read_error_prob = probability for an incorrect read by the machine
%
% sigma_pipette = The actual amount of DNA taken from a certain individual is
% a Gaussian random variable.
% sigma_pipette is its standard deviation (see paper)
%
%
% ============ Optional inputs ====================
% additional input which may be discarded:
%
% sqrtFlag = <1 simulates a Bernoulli(p) sensing matrix. Default is p=0.5
% sqrtFlag=1 leaves only sqrt(n_samples) non zero entries in
% each row, i.e., pool. The default value of sqrtFlag is 0
%
% forceBBflag = if equals 'replace half by 2', then half of the
% non-zero entries in the genotype vector are replaces
% by the value 2, to simulate the "BB" case, .i.e homozygous rare-allele.
% Any other value for forceBBflag simulates the regular "AB" case.
% The default value of forceBBflag corresponds to the "AB" case.
%
%
% =========== The output =========================
%
% x = is the simulated genotype vector to be reconstructed.
%
% fractionalOutput = is the CS original output, with fractional values
%
% discreteOutput = is a "rounded" version of the fractionalOutput,
% based on the noise level (see paper)
%
% num_reconstruction_errors = how many individuals were not reconstructed correcly
%
% pooling_matrix = a matrix representing the pooling design
%
% Noam Shental, Amnon Amir, Or Zuk, 2010
%
% This software is being provided "as is", without any express or
% implied warranty. In particular, the authors do not make any
% representation or warranty of any kind concerning the merchantability
% of this software or its fitness for any particular purpose."
%================================================
function [x fractionalOutput discreteOutput num_reconstruction_errors ...
pooling_matrix] = ...
simulateCSseq(n_samples, k_pools, s, mean_reads, ...
sigma_pipette, read_error_prob, sqrtFlag, forceBBflag)
x = zeros(n_samples,1); % create genotype vector x. It is chosen randomly
rand('twister',sum(100*clock));
position = randperm(n_samples);
if(s < 1) % here sample binomial with prob. s
s = binornd(n_samples, s);
end
x(position(1:s)) = 1;
if exist('forceBBflag') % consider the homozygous rare-allele
if strcmp(forceBBflag,'replace half by 2')
twoPos = randperm(s);
x(position(twoPos(1:round(s/2)))) = 2;
end
end
%end genotype vector issues
%=============
%=================================
% create the sensing matrix M
if ~exist('sqrtFlag', 'var')
sqrtFlag = 0.5; % use 0.5 case
end
if (sqrtFlag == 1) % the sqrt(n_samples) case
sqrtN = round(sqrt(n_samples));
M = zeros(k_pools, n_samples);
for jj=1:k_pools
tmpPerm = randperm(n_samples);
M(jj,tmpPerm(1:sqrtN)) = 1;
end
else % the Bernoulli 0.5*n_samples case
M = double(rand(k_pools, n_samples) < sqrtFlag);
end
pooling_matrix = M; % output pooling matrix
M0 = normalizeRows(M,2)/2; % the original normalized sensing matrix - before the pipette noise, which is unknown, is added
Z = randn(size(M)) .* sigma_pipette; % take noise in DNA quanties. The noise affects only the non-zero entries in the matrix
M = max(M + M .* Z, 0); % add pipette noise
%end sensing matrix issue
% ================================
% Equation 6 in the paper
qCorrect = (normalizeRows(M,2) * x) ./ 2; % vector of probabilities for a read to be one (divided by two to account for two chromosomes)
% Equation 8 in the paper
qEff = qCorrect .* (1-read_error_prob) + (1-qCorrect) .* read_error_prob; % add the effect of the read errors by convolution
% set the actual number of reads according to a Gamma distribution (based on Prabhu and Pe'er)
reads = round(gamrnd(mean_reads, 1));
% take the measurement based on a binomial distribution
large_inds = find(reads .* qEff > 50); % above 50 we can use Gaussian approximation
small_inds = setdiff(1:length(qEff), large_inds);
qReads = zeros(length(qEff),1); % make a column vector
if(~isempty(small_inds))
qReads(small_inds) = binornd(reads, qEff(small_inds)) ./ reads;
end
if(~isempty(large_inds)) % use Gaussian approximation
qReads(large_inds) = round(normrnd(reads .* qEff(large_inds), ...
sqrt(reads .* qEff(large_inds) .* (1-qEff(large_inds))))) ./ reads;
end
% notice that in case reads*qEff and reads*qEff*(1-qEff) are large enough, they can be approximated using by a Gaussian - consider this, as this
% may be a lot faster, in case the number of reads is large.
% In this version we assume the read error is known, and use Equation:
qMeasurement = (qReads-read_error_prob)./(1-2*read_error_prob);
%==========================================
% perform reconstruction
tau = 0.005*max(abs(M0'*qMeasurement));
% fractional solution
fractionalOutput = applyGPSR(qMeasurement,M0,tau);
% look for the solution with minimal squre error with the measurements
discreteOutput = modifySolution(fractionalOutput,M0,qMeasurement);
num_reconstruction_errors = sum(x ~= discreteOutput); % Compute success
%end of reconstruction ==============================================