Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions c/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ Unreleased
fully support TSK_ISOLATED_NOT_MISSING not being set for internal nodes.
(:user:`benjeffery`, :pr:`3313`)

- ``tsk_variant_init`` now accepts duplicate user-provided samples. When
a samples array contains the same node more than once, genotypes are
produced for each occurrence. This relaxes a previous restriction that
raised ``TSK_ERR_DUPLICATE_SAMPLE``.
(:user:`benjeffery`, :issue:`XXXX`, :pr:`YYYY`)


--------------------
[1.2.0] - 2025-09-24
Expand Down
35 changes: 30 additions & 5 deletions c/tests/test_genotypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,34 @@ test_single_tree_binary_alphabet(void)
tsk_treeseq_free(&ts);
}

static void
test_duplicate_samples_repeat_genotypes(void)
{
int ret = 0;
tsk_treeseq_t ts;
tsk_vargen_t vargen;
tsk_variant_t *var;
/* Use a duplicate of sample 1 */
tsk_id_t samples[] = { 0, 1, 1, 2 };

tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
ret = tsk_vargen_init(&vargen, &ts, samples, 4, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_EQUAL(var->num_samples, 4);
CU_ASSERT_EQUAL(var->site.id, 0);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
CU_ASSERT_EQUAL(var->genotypes[1], 0);
CU_ASSERT_EQUAL(var->genotypes[2], 0);
CU_ASSERT_EQUAL(var->genotypes[3], 1);

tsk_vargen_free(&vargen);
tsk_treeseq_free(&ts);
}

static void
test_single_tree_non_samples(void)
{
Expand Down Expand Up @@ -672,11 +700,6 @@ test_single_tree_errors(void)
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
tsk_vargen_free(&vargen);

samples[0] = 3;
ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE);
tsk_vargen_free(&vargen);

tsk_treeseq_free(&ts);
}

Expand Down Expand Up @@ -1211,6 +1234,8 @@ main(int argc, char **argv)
{ "test_single_tree_user_alleles", test_single_tree_user_alleles },
{ "test_single_tree_char_alphabet", test_single_tree_char_alphabet },
{ "test_single_tree_binary_alphabet", test_single_tree_binary_alphabet },
{ "test_duplicate_samples_repeat_genotypes",
test_duplicate_samples_repeat_genotypes },
{ "test_single_tree_non_samples", test_single_tree_non_samples },
{ "test_isolated_internal_node", test_isolated_internal_node },
{ "test_single_tree_errors", test_single_tree_errors },
Expand Down
44 changes: 32 additions & 12 deletions c/tskit/genotypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,25 +100,30 @@ variant_init_samples_and_index_map(tsk_variant_t *self,
self->alt_samples = tsk_malloc(num_samples_alloc * sizeof(*samples));
self->alt_sample_index_map
= tsk_malloc(num_nodes * sizeof(*self->alt_sample_index_map));
if (self->alt_samples == NULL || self->alt_sample_index_map == NULL) {
self->alt_sample_next_index
= tsk_malloc(num_samples_alloc * sizeof(*self->alt_sample_next_index));
if (self->alt_samples == NULL || self->alt_sample_index_map == NULL
|| self->alt_sample_next_index == NULL) {
ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
goto out;
}
tsk_memcpy(self->alt_samples, samples, num_samples * sizeof(*samples));
tsk_memset(self->alt_sample_index_map, 0xff,
num_nodes * sizeof(*self->alt_sample_index_map));
/* Create the reverse mapping */
/* Initialise next-index links to NULL */
tsk_memset(self->alt_sample_next_index, 0xff,
num_samples_alloc * sizeof(*self->alt_sample_next_index));
/* Create the reverse mapping. Allow duplicates by building a per-node linked list
* of indices into the samples array. */
for (j = 0; j < num_samples; j++) {
u = samples[j];
if (u < 0 || u >= (tsk_id_t) num_nodes) {
ret = tsk_trace_error(TSK_ERR_NODE_OUT_OF_BOUNDS);
goto out;
}
if (self->alt_sample_index_map[u] != TSK_NULL) {
ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE);
goto out;
}
self->alt_sample_index_map[samples[j]] = (tsk_id_t) j;
/* Push-front onto the index list for node u */
self->alt_sample_next_index[j] = self->alt_sample_index_map[u];
self->alt_sample_index_map[u] = (tsk_id_t) j;
}
out:
return ret;
Expand Down Expand Up @@ -267,6 +272,7 @@ tsk_variant_free(tsk_variant_t *self)
tsk_safe_free(self->samples);
tsk_safe_free(self->alt_samples);
tsk_safe_free(self->alt_sample_index_map);
tsk_safe_free(self->alt_sample_next_index);
tsk_safe_free(self->traversal_stack);
return 0;
}
Expand Down Expand Up @@ -360,7 +366,7 @@ tsk_variant_traverse(
const tsk_id_t *restrict left_child = self->tree.left_child;
const tsk_id_t *restrict right_sib = self->tree.right_sib;
const tsk_id_t *restrict sample_index_map = self->sample_index_map;
tsk_id_t u, v, sample_index;
tsk_id_t u, v, sample_index, idx;
int stack_top;
int no_longer_missing = 0;

Expand All @@ -370,11 +376,24 @@ tsk_variant_traverse(
u = stack[stack_top];
sample_index = sample_index_map[u];
if (sample_index != TSK_NULL) {
ret = visit(self, sample_index, derived);
if (ret < 0) {
goto out;
if (self->alt_samples != NULL) {
/* Iterate all duplicate indices for this node */
idx = sample_index;
while (idx != TSK_NULL) {
ret = visit(self, idx, derived);
if (ret < 0) {
goto out;
}
no_longer_missing += ret;
idx = self->alt_sample_next_index[idx];
}
} else {
ret = visit(self, sample_index, derived);
if (ret < 0) {
goto out;
}
no_longer_missing += ret;
}
no_longer_missing += ret;
}
stack_top--;
for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) {
Expand Down Expand Up @@ -609,6 +628,7 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other)
other->sample_index_map = NULL;
other->alt_samples = NULL;
other->alt_sample_index_map = NULL;
other->alt_sample_next_index = NULL;
other->user_alleles_mem = NULL;

total_len = 0;
Expand Down
4 changes: 4 additions & 0 deletions c/tskit/genotypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ typedef struct {
tsk_flags_t options;
tsk_id_t *alt_samples;
tsk_id_t *alt_sample_index_map;
/* For user-specified samples (alt_samples != NULL), map from an index in the
* samples array to the next index for the same node, forming a linked list.
* The head of the list for node u is alt_sample_index_map[u]. */
tsk_id_t *alt_sample_next_index;

} tsk_variant_t;

Expand Down
38 changes: 38 additions & 0 deletions python/tests/test_genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,20 @@ def test_genotype_matrix(self, samples):
assert np.array_equal(G, G2)
assert G2.dtype == np.int32

def test_genotype_matrix_duplicate_samples(self):
ts = self.get_tree_sequence()
s = ts.samples()
# Duplicate the second sample
dup_samples = [int(s[0]), int(s[1]), int(s[1]), int(s[2])]
G_dup = ts.genotype_matrix(samples=dup_samples)
G_unique = ts.genotype_matrix(samples=[int(s[0]), int(s[1]), int(s[2])])
assert G_dup.shape[1] == 4
assert G_unique.shape[1] == 3
assert np.array_equal(G_dup[:, 0], G_unique[:, 0])
assert np.array_equal(G_dup[:, 1], G_unique[:, 1])
assert np.array_equal(G_dup[:, 2], G_unique[:, 1])
assert np.array_equal(G_dup[:, 3], G_unique[:, 2])

def test_recurrent_mutations_over_samples(self):
ts = self.get_tree_sequence()
tables = ts.dump_tables()
Expand Down Expand Up @@ -2823,3 +2837,27 @@ def test_variant_repr(self, ts_fixture):
assert re.search(rf"position={v.position}", str_rep)
assert re.search(rf"\'has_missing_data\': {v.has_missing_data}", str_rep)
assert re.search(rf"\'isolated_as_missing\': {v.isolated_as_missing}", str_rep)

def test_variants_allow_duplicate_samples(self):
ts = msprime.sim_ancestry(4, random_seed=123)
ts = tsutil.insert_branch_sites(ts)
samples = ts.samples()
s0 = int(samples[0])
s1 = int(samples[1])
dup_samples = [s0, s1, s1]
# Iterate until we find a site where the two base samples differ,
# so the duplicate check is informative.
for v_dup, v_unique in zip(
ts.variants(samples=dup_samples), ts.variants(samples=[s0, s1])
):
assert len(v_dup.genotypes) == 3
# Ensure the pattern can detect duplicate errors
if v_unique.genotypes[0] != v_unique.genotypes[1]:
assert v_dup.genotypes[0] == v_unique.genotypes[0]
assert v_dup.genotypes[1] == v_unique.genotypes[1]
assert v_dup.genotypes[2] == v_unique.genotypes[1]
break
else:
pytest.fail(
"No variant with differing genotypes between selected samples found"
)
2 changes: 1 addition & 1 deletion python/tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ def test_empty_individuals(self):
def test_duplicate_individuals(self):
ts = msprime.sim_ancestry(3, random_seed=2)
ts = tsutil.insert_branch_sites(ts)
with pytest.raises(tskit.LibraryError, match="TSK_ERR_DUPLICATE_SAMPLE"):
with pytest.raises(ValueError, match="Duplicate individuals"):
ts.as_vcf(individuals=[0, 0], allow_position_zero=True)

def test_samples_with_and_without_individuals(self):
Expand Down
12 changes: 9 additions & 3 deletions python/tskit/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -5444,7 +5444,9 @@ def variants(
By default, genotypes are generated for all samples. The ``samples``
parameter allows us to specify the nodes for which genotypes are
generated; output order of genotypes in the returned variants
corresponds to the order of the samples in this list. It is also
corresponds to the order of the samples in this list. Duplicate node IDs
are permitted and will result in repeated entries in the returned genotypes.
It is also
possible to provide **non-sample** nodes as an argument here, if you
wish to generate genotypes for (e.g.) internal nodes. Missingness is
detected for any requested node (sample or non-sample) when
Expand Down Expand Up @@ -5567,8 +5569,9 @@ def genotype_matrix(

:param array_like samples: An array of node IDs for which to generate
genotypes. If ``None`` (default), generate genotypes for all sample
nodes. Non-sample nodes may also be provided, in which case genotypes
will be generated for those nodes too.
nodes. Duplicate node IDs are permitted and will result in repeated
columns in the returned matrix. Non-sample nodes may also be provided,
in which case genotypes will be generated for those nodes too.
:param bool isolated_as_missing: If True, the genotype value assigned to
isolated nodes without mutations (samples or non-samples) is
:data:`.MISSING_DATA` (-1). If False, such nodes will be
Expand Down Expand Up @@ -11075,6 +11078,9 @@ def map_to_vcf_model(
individuals = np.arange(self.num_individuals, dtype=np.int32)
if len(individuals) == 0:
raise ValueError("No individuals specified")
# Reject duplicate individuals explicitly to preserve VCF semantics
if len(np.unique(individuals)) != len(individuals):
raise ValueError("Duplicate individuals specified")
if min(individuals) < 0 or max(individuals) >= self.num_individuals:
raise ValueError("Invalid individual ID")

Expand Down