
Commit 5a76add

QiuYuFong and qiuyufeng authored
Remove the use of pickle to prevent security issues (#1108)
Co-authored-by: qiuyufeng <qiuyufeng3@huawei.com>
1 parent c5dccc5 commit 5a76add

File tree

24 files changed: +243 −127 lines changed

examples/animatediff/tests/test_mm.py

Lines changed: 2 additions & 2 deletions

@@ -20,7 +20,7 @@ def load_pt_rt_states(mm_idx, folder):
     with open(args_fp, "r") as fp:
         args = json.load(fp)

-    inps = np.load(inps_fp, allow_pickle=True)
+    inps = np.load(inps_fp)
     inps_dict = {}
     for name in inps:
         if inps[name].ndim > 0:
@@ -29,7 +29,7 @@ def load_pt_rt_states(mm_idx, folder):
         else:
             inps_dict[name] = None

-    output = np.load(outs_fp, allow_pickle=True)["output"]
+    output = np.load(outs_fp)["output"]

     return args, inps_dict, output
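
The `allow_pickle=True` argument can be dropped because these archives contain plain ndarrays. `np.load` defaults to `allow_pickle=False`, which loads ordinary arrays fine and rejects pickled object arrays. A minimal sketch (file names are hypothetical):

```python
import numpy as np

# Plain ndarrays (no dtype=object) round-trip without pickle.
np.savez("inps.npz", x=np.arange(3), scale=np.float32(0.5))

inps = np.load("inps.npz")  # allow_pickle defaults to False
print({name: inps[name].ndim for name in inps})  # {'x': 1, 'scale': 0}

# An archive holding dtype=object data would raise instead:
# ValueError: Object arrays cannot be loaded when allow_pickle=False
```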

examples/hunyuan_dit/IndexKits/index_kits/dataset/config_parse.py

Lines changed: 1 addition & 4 deletions

@@ -1,7 +1,6 @@
 # Adapted from https://github.com/Tencent-Hunyuan/HunyuanDiT to work with MindSpore.
 import json
 import pathlib
-import pickle
 from collections import defaultdict
 from glob import glob
 from pathlib import Path
@@ -108,9 +107,7 @@ def load_md5s(paths: List[pathlib.Path], merge=True):
         if not path.exists():
             raise ValueError(f"Path not found: {path}")
         if path.suffix == ".pkl":
-            with path.open("rb") as f:
-                md5s = pickle.load(f)
-            assert isinstance(md5s, (set, dict)), f"Invalid type: {type(md5s)}"
+            raise ValueError("Loading pickle file is unsafe, please use another file type.")
         elif path.suffix == ".json":
             with path.open() as f:
                 md5s = json.load(f)
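
With the `.pkl` branch now raising, any existing md5 pickle needs a one-time migration to JSON. A one-off sketch, assuming `md5s.pkl` is a hypothetical path to a file you created yourself and the conversion runs in an isolated environment; note that JSON has no set type, so a pickled `set` is stored as a sorted list and comes back from `json.load` as a `list`:

```python
import json
import pickle

# Trusted legacy file only; pickle.load can execute arbitrary code otherwise.
with open("md5s.pkl", "rb") as f:
    md5s = pickle.load(f)

assert isinstance(md5s, (set, dict)), f"Invalid type: {type(md5s)}"

with open("md5s.json", "w") as f:
    # Sets are not JSON-serializable, so store them as a sorted list.
    json.dump(sorted(md5s) if isinstance(md5s, set) else md5s, f)
```
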
Lines changed: 11 additions & 14 deletions

@@ -1,21 +1,18 @@
-import pickle
-
 import numpy as np


-def load_pkl(file_path):
-    with open(file_path, "rb") as f:
-        data = pickle.load(f)
-    return data
+def load_np(file_path: str):
+    with np.load(file_path) as data:
+        return [data[key] for key in data.files]


 def compare_arrays(arr1, arr2, rtol=1e-05, atol=1e-08):
     return np.allclose(arr1, arr2, rtol=rtol, atol=atol)


-def compare_pkl_files(file_path1, file_path2):
-    data1 = load_pkl(file_path1)
-    data2 = load_pkl(file_path2)
+def compare_npz_files(file_path1, file_path2):
+    data1 = load_np(file_path1)
+    data2 = load_np(file_path2)

     if len(data1) != len(data2):
         print(
@@ -33,11 +30,11 @@ compare_pkl_files(file_path1, file_path2)
 # Example usage
 max_batches_to_save = 5
 for i_batch in range(max_batches_to_save):
-    file_path1 = f"./rank0_batch{i_batch}.pkl"
-    file_path2 = f"./rank1_batch{i_batch}.pkl"
+    file_path1 = f"./rank0_batch{i_batch}.npz"
+    file_path2 = f"./rank1_batch{i_batch}.npz"
     print(f"Comparing rank0 and rank1 from batch {i_batch}")
-    compare_pkl_files(file_path1, file_path2)
+    compare_npz_files(file_path1, file_path2)

     if i_batch > 0:
-        print(f"Comparing batch{i_batch} and batch{i_batch-1} from rank0")
-        compare_pkl_files(f"./rank0_batch{i_batch}.pkl", f"./rank0_batch{i_batch-1}.pkl")
+        print(f"Comparing batch{i_batch} and batch{i_batch - 1} from rank0")
+        compare_npz_files(f"./rank0_batch{i_batch}.npz", f"./rank0_batch{i_batch - 1}.npz")
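
For reference, `compare_arrays` delegates to `np.allclose`, which checks elementwise that `|arr1 - arr2| <= atol + rtol * |arr2|`; the tolerance scales with the second argument, so the check is not symmetric. A quick illustration:

```python
import numpy as np

a = np.array([1.0, 1.00001])
b = np.array([1.0, 1.0])

# |1.00001 - 1.0| = 1e-5 <= 1e-8 + 1e-5 * 1.0, so this passes.
print(np.allclose(a, b, rtol=1e-05, atol=1e-08))  # True
print(np.allclose(a, b, rtol=1e-06, atol=1e-08))  # False: 1e-5 > 1.01e-6
```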

examples/hunyuanvideo/tests/parallel/test_sequence_parallel_data.py

Lines changed: 3 additions & 4 deletions

@@ -1,10 +1,10 @@
 import logging
 import os
-import pickle
 import sys
 from pathlib import Path
 from typing import Dict, Tuple, Union

+import numpy as np
 from jsonargparse import ActionConfigFile, ArgumentParser
 from jsonargparse.typing import path_type

@@ -123,11 +123,10 @@ def main(args):
             item = item.float().asnumpy()
         data.append(item)

-        save_path = f"{save_path_prefix}{batch_count}.pkl"
+        save_path = f"{save_path_prefix}{batch_count}.npz"
         save_path = str(Path(save_path).absolute())

-        with open(save_path, "wb") as f:
-            pickle.dump(data, f)
+        np.savez(save_path, *data)

         batch_count += 1
         print(f"The batch {batch_count} data has been saved to {save_path}")

examples/instantmesh/README.md

Lines changed: 31 additions & 0 deletions

@@ -118,6 +118,37 @@ One needs to patch `mindcv.models.vgg` in L62 to enable conv kernel bias to alig
 + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, pad_mode="pad", padding=1, has_bias=True)
 ```

+### ⚠️ Warning:
+If the dataset has pickle files, pay attention to the following security risks.
+
+- Loading pickle files can lead to:
+  - Remote Code Execution (RCE)
+  - Sensitive data leakage
+  - System compromise
+- By using pickle files, you acknowledge these risks and agree to:
+  - Convert pickle files only in isolated environments (e.g., a sandbox or container)
+  - Never load `.pkl` files from untrusted sources
+
+For more information, review the [documentation](https://docs.python.org/3/library/pickle.html) for the ``pickle`` module.
+
+Taking the following training dataset as an example:
+```shell
+unzip training_examples.zip && tree training_examples
+```
+```text
+training_examples
+├─input
+│  └──uid_0
+│     ├─000.png
+│     └──meta.pkl
+├─target
+│  └──uid_0
+│     └──000.png
+└──uid_set.pkl
+```
+The data type stored in ``meta.pkl`` is `List[np.ndarray]`, and the data type stored in ``uid_set.pkl`` is `List[str]`.
+We recommend converting the ``meta.pkl`` file to ``meta.npz`` format and the ``uid_set.pkl`` file to ``uid_set.json`` format yourself before training.
+
 ### Data Curation
 Following the original paper, we used Blender to render multiview frames for a 3D object in `.obj` for training. Typically for overfitting, three 3D objects from the objaverse dataset are used. We rendered 5 arbitrary views for each object with the corresponding camera parameters extracted.
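
A one-off conversion sketch matching the layout above (the paths follow the example tree; per the warning, run it only on files you created yourself, in an isolated environment):

```python
import json
import pickle

import numpy as np

# meta.pkl holds List[np.ndarray]; save it positionally as arr_0, arr_1, ...
with open("training_examples/input/uid_0/meta.pkl", "rb") as f:
    meta = pickle.load(f)
np.savez("training_examples/input/uid_0/meta.npz", *meta)

# uid_set.pkl holds List[str]; a plain list maps directly to JSON.
with open("training_examples/uid_set.pkl", "rb") as f:
    uids = pickle.load(f)
with open("training_examples/uid_set.json", "w") as f:
    json.dump(uids, f)
```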

examples/instantmesh/configs/instant-nerf-large-train.yaml

Lines changed: 1 addition & 1 deletion

@@ -32,7 +32,7 @@ data:
   target: data.objaverse.ObjaverseDataset
   params:
     root_dir: YOUR_PATH_DATA # for overfitting exp
-    meta_fname: uid_set.pkl
+    meta_fname: uid_set.json
     input_image_dir: input
     target_image_dir: input
     input_view_num: 3

examples/instantmesh/data/objaverse.py

Lines changed: 14 additions & 13 deletions

@@ -2,7 +2,6 @@
 import json
 import math
 import os
-import pickle
 import sys
 from pathlib import Path

@@ -24,9 +23,14 @@
 from mindspore.dataset.vision import Inter, Resize, ToPIL


-def read_pickle(pkl_path):
-    with open(pkl_path, "rb") as f:
-        return pickle.load(f)
+def read_np(np_path):
+    with np.load(np_path) as data:
+        return [data[key] for key in data.files]
+
+
+def read_json(json_path):
+    with open(json_path, "r") as f:
+        return json.load(f)


 def read_txt2list(txt_path):
@@ -38,11 +42,6 @@ def read_txt2list(txt_path):
     return list_entry


-def read_json(json_path):
-    with open(json_path) as f:
-        return json.load(f)
-
-
 def random_crop_return_params(imgs, height, width):
     """imgs: (b h w c)"""
     assert imgs.shape[1] >= height
@@ -63,7 +62,7 @@ class ObjaverseDataset:
     def __init__(
         self,
         root_dir="training_examples/",
-        meta_fname="uid_set.pkl",
+        meta_fname="uid_set.json",
         input_image_dir="input",
         target_image_dir="input",
         input_view_num=6,
@@ -95,9 +94,11 @@ def __init__(
         ]

         if meta_fname == "uid_set.pkl":
-            self.paths = read_pickle(os.path.join(root_dir, meta_fname))[-3:]
+            raise TypeError("Loading pickle file is unsafe, please use another file type.")
+        elif meta_fname == "uid_set.json":
+            self.paths = read_json(os.path.join(root_dir, meta_fname))[-3:]
             # [:1] # only takes the first scene for debugging
-            print("dataset read pickle")
+            print("dataset read json")
         elif meta_fname.split(".")[-1] == "txt":
             self.paths = read_txt2list(os.path.join(root_dir, meta_fname))
             print("reading the fixed pose target list as the dataset")
@@ -236,7 +237,7 @@ def __getitem__(self, index):
         alpha_list = []
         pose_list = []

-        K, azimuths, elevations, distances, cam_poses = read_pickle(os.path.join(input_image_path, "meta.pkl"))
+        K, azimuths, elevations, distances, cam_poses = read_np(os.path.join(input_image_path, "meta.npz"))
         input_cameras = cam_poses
         for idx in input_indices:
             image, alpha = self.load_im(os.path.join(input_image_path, "%03d.png" % idx), bg_white)
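
`read_np` returns the arrays in `data.files` order, so a replacement `meta.npz` must be written positionally in the same order the old `meta.pkl` tuple used. A writer sketch with dummy values (the shapes are illustrative assumptions, not taken from the repo):

```python
import numpy as np

# Dummy values for illustration; real data comes from the Blender renders.
K = np.eye(3, dtype=np.float32)                    # camera intrinsics
azimuths = np.zeros(5, dtype=np.float32)
elevations = np.zeros(5, dtype=np.float32)
distances = np.ones(5, dtype=np.float32)
cam_poses = np.zeros((5, 4, 4), dtype=np.float32)

# Positional savez stores arr_0 ... arr_4 in this order, which is the order
# the K, azimuths, elevations, distances, cam_poses unpacking relies on.
np.savez("meta.npz", K, azimuths, elevations, distances, cam_poses)
```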

examples/opensora_pku/opensora/dataset/t2v_datasets.py

Lines changed: 1 addition & 3 deletions

@@ -6,7 +6,6 @@
 import logging
 import math
 import os
-import pickle
 import random
 import time
 from collections import Counter
@@ -250,8 +249,7 @@ def define_frame_index(self, data):
             with open(anno, "r") as f:
                 sub_list = json.load(f)
         elif anno.endswith(".pkl"):
-            with open(anno, "rb") as f:
-                sub_list = pickle.load(f)
+            raise TypeError("Loading pickle file is unsafe, please use another file type.")
         for index, i in enumerate(tqdm(sub_list)):
             cnt += 1
             path = os.path.join(sub_root, i["path"])

examples/sv3d/configs/sv3d_u_train.yaml

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@ train:
   class_path: data.mulvideo_dataset.MulviewVideoDataset
   init_args:
     root_dir: itmh_training_data_9 # for overfitting exp
-    metadata: uid_set.pkl
+    metadata: uid_set.json
     image_dir: target # for overfitting w/o pose
     frames: 5 # to accommodate training with obj-rendering dataset

examples/sv3d/data/mulvideo_dataset.py

Lines changed: 5 additions & 5 deletions

@@ -1,6 +1,6 @@
+import json
 import logging
 import os
-import pickle
 import sys
 from pathlib import Path
 from typing import Any, List, Tuple
@@ -16,9 +16,9 @@
 _logger = logging.getLogger("")


-def read_pickle(pkl_path):
-    with open(pkl_path, "rb") as f:
-        return pickle.load(f)
+def read_json(json_path):
+    with open(json_path, "r") as f:
+        return json.load(f)


 def load_im(path, color):
@@ -39,7 +39,7 @@ def __init__(
         frames: int,
     ):
         self.root_dir = Path(root_dir)
-        self.paths = read_pickle(os.path.join(root_dir, metadata))
+        self.paths = read_json(os.path.join(root_dir, metadata))
         self.image_dir = image_dir
         self._frames = frames
         self._bg_white = [1.0, 1.0, 1.0]
