Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 30 additions & 13 deletions lib/galaxy/datatypes/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,22 +552,29 @@ class _BamOrSam:
Helper class to set the metadata common to sam and bam files
"""

max_references = 100000

def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
try:
bam_file = pysam.AlignmentFile(dataset.get_file_name(), mode="rb")
# TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
dataset.metadata.reference_names = list(bam_file.references)
dataset.metadata.reference_lengths = list(bam_file.lengths)
dataset.metadata.bam_header = dict(bam_file.header.items()) # type: ignore [attr-defined]
dataset.metadata.read_groups = [
read_group["ID"] for read_group in dataset.metadata.bam_header.get("RG", []) if "ID" in read_group
]
dataset.metadata.sort_order = dataset.metadata.bam_header.get("HD", {}).get("SO", None)
dataset.metadata.bam_version = dataset.metadata.bam_header.get("HD", {}).get("VN", None)
with pysam.AlignmentFile(dataset.get_file_name(), mode="rb", check_sq=False) as bam_file:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm going to approve because your time is valuable and I think you have more important things to work on but I do feel like adding a metadata element to indicate these lists are truncated would be the ideal thing to do in this case so the meaning of these fields is consistent across bam files.

# TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
if bam_file.nreferences <= self.max_references:
dataset.metadata.reference_names = list(bam_file.references)
dataset.metadata.reference_lengths = list(bam_file.lengths)
dataset.metadata.bam_header = dict(bam_file.header.items()) # type: ignore [attr-defined]
dataset.metadata.read_groups = [
read_group["ID"]
for read_group in dataset.metadata.bam_header.get("RG", [])
if "ID" in read_group
]
else:
dataset.metadata.metadata_incomplete = True
dataset.metadata.sort_order = bam_file.header.get("HD", {}).get("SO", None) # type: ignore [attr-defined]
dataset.metadata.bam_version = bam_file.header.get("HD", {}).get("VN", None) # type: ignore [attr-defined]
except Exception:
# Per Dan, don't log here because doing so will cause datasets that
# fail metadata to end in the error state
pass
dataset.metadata.metadata_incomplete = True


class BamNative(CompressedArchive, _BamOrSam):
Expand Down Expand Up @@ -656,6 +663,16 @@ class BamNative(CompressedArchive, _BamOrSam):
optional=True,
no_value={},
)
# Set to True by _BamOrSam.set_meta when the per-reference lists were skipped
# (more than max_references references) or when metadata extraction failed.
MetadataElement(
name="metadata_incomplete",
default=False,
desc="Indicates if metadata is incomplete",
param=MetadataParameter,
readonly=True,
visible=False,
optional=True,
no_value=False,
)

def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
    """Delegate metadata extraction to the shared SAM/BAM helper."""
    helper = _BamOrSam()
    helper.set_meta(dataset, overwrite=overwrite, **kwd)
Expand Down Expand Up @@ -1054,7 +1071,7 @@ def dataset_content_needs_grooming(self, file_name: str) -> bool:
"""
# The best way to ensure that BAM files are coordinate-sorted and indexable
# is to actually index them.
with pysam.AlignmentFile(filename=file_name) as f:
with pysam.AlignmentFile(filename=file_name, check_sq=False) as f:
# The only sure thing we know here is that the sort order can't be coordinate
return f.header.get("HD", {}).get("SO") == "coordinate" # type: ignore[attr-defined]

def dataset_content_needs_grooming(self, file_name: str) -> bool:
    """Return True unless the BAM file's @HD SO header tag is "queryname".

    Anything not explicitly queryname-sorted needs grooming here.
    check_sq=False lets pysam open files whose header has no @SQ lines.
    """
    with pysam.AlignmentFile(filename=file_name, check_sq=False) as alignments:
        sort_order = alignments.header.get("HD", {}).get("SO")  # type: ignore[attr-defined]
    return sort_order != "queryname"


Expand Down
Loading