diff --git a/lib/galaxy/datatypes/binary.py b/lib/galaxy/datatypes/binary.py
index 146447cb0e5c..1a7372f78ac8 100644
--- a/lib/galaxy/datatypes/binary.py
+++ b/lib/galaxy/datatypes/binary.py
@@ -552,22 +552,29 @@ class _BamOrSam:
     Helper class to set the metadata common to sam and bam files
     """
 
+    max_references = 100000
+
     def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
         try:
-            bam_file = pysam.AlignmentFile(dataset.get_file_name(), mode="rb")
-            # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
-            dataset.metadata.reference_names = list(bam_file.references)
-            dataset.metadata.reference_lengths = list(bam_file.lengths)
-            dataset.metadata.bam_header = dict(bam_file.header.items())  # type: ignore [attr-defined]
-            dataset.metadata.read_groups = [
-                read_group["ID"] for read_group in dataset.metadata.bam_header.get("RG", []) if "ID" in read_group
-            ]
-            dataset.metadata.sort_order = dataset.metadata.bam_header.get("HD", {}).get("SO", None)
-            dataset.metadata.bam_version = dataset.metadata.bam_header.get("HD", {}).get("VN", None)
+            with pysam.AlignmentFile(dataset.get_file_name(), mode="rb", check_sq=False) as bam_file:
+                # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
+                if bam_file.nreferences <= self.max_references:
+                    dataset.metadata.reference_names = list(bam_file.references)
+                    dataset.metadata.reference_lengths = list(bam_file.lengths)
+                    dataset.metadata.bam_header = dict(bam_file.header.items())  # type: ignore [attr-defined]
+                    dataset.metadata.read_groups = [
+                        read_group["ID"]
+                        for read_group in dataset.metadata.bam_header.get("RG", [])
+                        if "ID" in read_group
+                    ]
+                else:
+                    dataset.metadata.metadata_incomplete = True
+                dataset.metadata.sort_order = bam_file.header.get("HD", {}).get("SO", None)  # type: ignore [attr-defined]
+                dataset.metadata.bam_version = bam_file.header.get("HD", {}).get("VN", None)  # type: ignore [attr-defined]
         except Exception:
             # Per Dan, don't log here because doing so will cause datasets that
             # fail metadata to end in the error state
-            pass
+            dataset.metadata.metadata_incomplete = True
 
 
 class BamNative(CompressedArchive, _BamOrSam):
@@ -656,6 +663,16 @@ class BamNative(CompressedArchive, _BamOrSam):
         optional=True,
         no_value={},
     )
+    MetadataElement(
+        name="metadata_incomplete",
+        default=False,
+        desc="Indicates if metadata is incomplete",
+        param=MetadataParameter,
+        readonly=True,
+        visible=False,
+        optional=True,
+        no_value=False,
+    )
 
     def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
         _BamOrSam().set_meta(dataset, overwrite=overwrite, **kwd)
@@ -1054,7 +1071,7 @@ def dataset_content_needs_grooming(self, file_name: str) -> bool:
         """
         # The best way to ensure that BAM files are coordinate-sorted and indexable
         # is to actually index them.
-        with pysam.AlignmentFile(filename=file_name) as f:
+        with pysam.AlignmentFile(filename=file_name, check_sq=False) as f:
             # The only sure thing we know here is that the sort order can't be coordinate
             return f.header.get("HD", {}).get("SO") == "coordinate"  # type: ignore[attr-defined]
@@ -1074,7 +1091,7 @@ def dataset_content_needs_grooming(self, file_name: str) -> bool:
         """
         # The best way to ensure that BAM files are coordinate-sorted and indexable
         # is to actually index them.
-        with pysam.AlignmentFile(filename=file_name) as f:
+        with pysam.AlignmentFile(filename=file_name, check_sq=False) as f:
             return f.header.get("HD", {}).get("SO") != "queryname"  # type: ignore[attr-defined]