fix enable_cache by Jintao-Huang · Pull Request #4091 · modelscope/ms-swift · GitHub

fix enable_cache #4091

Merged · 8 commits · May 6, 2025
Changes from all commits
6 changes: 3 additions & 3 deletions README.md
@@ -120,10 +120,10 @@ Running Environment:
| transformers | >=4.33 | 4.51 | |
| modelscope | >=1.19 | | |
| peft | >=0.11,<0.16 | ||
- | trl | >=0.13,<0.17 | 0.16 |RLHF|
+ | trl | >=0.13,<0.18 | 0.17 |RLHF|
| deepspeed | >=0.14 | 0.14.5 | Training |
- | vllm | >=0.5.1 | 0.7.3/0.8.4 | Inference/Deployment/Evaluation |
- | lmdeploy | >=0.5 | 0.7.2.post1 | Inference/Deployment/Evaluation |
+ | vllm | >=0.5.1 | 0.7.3/0.8 | Inference/Deployment/Evaluation |
+ | lmdeploy | >=0.5 | 0.8 | Inference/Deployment/Evaluation |
| evalscope | >=0.11 | | Evaluation |

For more optional dependencies, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).
6 changes: 3 additions & 3 deletions README_CN.md
@@ -115,10 +115,10 @@ pip install -e .
| transformers | >=4.33 | 4.51 ||
| modelscope | >=1.19 | ||
| peft | >=0.11,<0.16 | ||
- | trl | >=0.13,<0.17 | 0.16 |RLHF|
+ | trl | >=0.13,<0.18 | 0.17 |RLHF|
| deepspeed | >=0.14 | 0.14.5 | Training |
- | vllm | >=0.5.1 | 0.7.3/0.8.4 | Inference/Deployment/Evaluation |
- | lmdeploy | >=0.5 | 0.7.2.post1 | Inference/Deployment/Evaluation |
+ | vllm | >=0.5.1 | 0.7.3/0.8 | Inference/Deployment/Evaluation |
+ | lmdeploy | >=0.5 | 0.8 | Inference/Deployment/Evaluation |
| evalscope | >=0.11 | | Evaluation |

For more optional dependencies, see [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).
6 changes: 3 additions & 3 deletions docs/source/GetStarted/SWIFT安装.md
@@ -70,10 +70,10 @@ modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu2
| transformers | >=4.33 | 4.51 ||
| modelscope | >=1.19 | ||
| peft | >=0.11,<0.16 | ||
- | trl | >=0.13,<0.17 | 0.16 |RLHF|
+ | trl | >=0.13,<0.18 | 0.17 |RLHF|
| deepspeed | >=0.14 | 0.14.5 | Training |
- | vllm | >=0.5.1 | 0.7.3/0.8.4 | Inference/Deployment/Evaluation |
- | lmdeploy | >=0.5 | 0.7.2.post1 | Inference/Deployment/Evaluation |
+ | vllm | >=0.5.1 | 0.7.3/0.8 | Inference/Deployment/Evaluation |
+ | lmdeploy | >=0.5 | 0.8 | Inference/Deployment/Evaluation |
| evalscope | >=0.11 | | Evaluation |

For more optional dependencies, see [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).
2 changes: 1 addition & 1 deletion docs/source/Instruction/命令行参数.md
@@ -44,6 +44,7 @@
- 🔥split_dataset_ratio: Ratio for splitting the training set and validation set when val_dataset is not specified, default is 0.01. Set to 0 if no validation split is needed.
- data_seed: Random seed for the dataset, default is 42.
- 🔥dataset_num_proc: Number of processes for dataset preprocessing, default is 1.
+ - 🔥load_from_cache_file: Whether to load the dataset from the cache, default is True.
- dataset_shuffle: Whether to shuffle the dataset. Defaults to True.
  - Note: The shuffling in CPT/SFT consists of two parts: dataset shuffling, controlled by `dataset_shuffle`; and shuffling in the train_dataloader, controlled by `train_dataloader_shuffle`.
- val_dataset_shuffle: Whether to shuffle the val_dataset. Default is False.
@@ -52,7 +53,6 @@
- interleave_prob: Defaults to None. When combining multiple datasets, the `concatenate_datasets` function is used by default; if this parameter is set, the `interleave_datasets` function is used instead. It is typically used when combining streaming datasets and is passed to the `interleave_datasets` function.
- stopping_strategy: Either "first_exhausted" or "all_exhausted", default is "first_exhausted". Passed to the `interleave_datasets` function.
- shuffle_buffer_size: Specifies the shuffle buffer size for streaming datasets. Defaults to 1000.
- - enable_cache: Use cache for dataset preprocessing, default is False.
- download_mode: Dataset download mode, including `reuse_dataset_if_exists` and `force_redownload`, default is reuse_dataset_if_exists.
- columns: Used for column mapping of the dataset so that it matches the format AutoPreprocessor can handle; see [here](../Customization/自定义数据集.md) for details. You can pass a JSON string, for example: `'{"text1": "query", "text2": "response"}'`, default is None.
- strict: If True, the dataset throws an error as soon as any row has a problem; otherwise erroneous samples are discarded. Default is False.
6 changes: 3 additions & 3 deletions docs/source_en/GetStarted/SWIFT-installation.md
@@ -71,10 +71,10 @@ More images can be found [here](https://modelscope.cn/docs/intro/environment-setup)
| transformers | >=4.33 | 4.51 | |
| modelscope | >=1.19 | | |
| peft | >=0.11,<0.16 | | |
- | trl | >=0.13,<0.17 | 0.16 | RLHF |
+ | trl | >=0.13,<0.18 | 0.17 | RLHF |
| deepspeed | >=0.14 | 0.14.5 | Training |
- | vllm | >=0.5.1 | 0.7.3/0.8.4 | Inference/Deployment/Evaluation |
- | lmdeploy | >=0.5 | 0.7.2.post1 | Inference/Deployment/Evaluation |
+ | vllm | >=0.5.1 | 0.7.3/0.8 | Inference/Deployment/Evaluation |
+ | lmdeploy | >=0.5 | 0.8 | Inference/Deployment/Evaluation |
| evalscope | >=0.11 | | Evaluation |

For more optional dependencies, you can refer to [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).
2 changes: 1 addition & 1 deletion docs/source_en/Instruction/Command-line-parameters.md
@@ -45,6 +45,7 @@ Hints:
- 🔥split_dataset_ratio: Ratio for splitting the training set and validation set when val_dataset is not specified, default is 0.01. Set to 0 if no validation set split is needed.
- data_seed: Random seed for the dataset, default is 42.
- 🔥dataset_num_proc: Number of processes for dataset preprocessing, default is 1.
+ - 🔥load_from_cache_file: Whether to load the dataset from the cache, default is True.
- dataset_shuffle: Whether to shuffle the dataset. Defaults to True.
- Note: The shuffling in CPT/SFT consists of two parts: dataset shuffling, controlled by `dataset_shuffle`; and shuffling in the train_dataloader, controlled by `train_dataloader_shuffle`.
- val_dataset_shuffle: Whether to perform shuffling on the val_dataset. Default is False.
@@ -53,7 +54,6 @@ Hints:
- interleave_prob: Defaults to None. When combining multiple datasets, the `concatenate_datasets` function is used by default. If this parameter is set, the `interleave_datasets` function will be used instead. This parameter is typically used when combining streaming datasets and is passed to the `interleave_datasets` function.
- stopping_strategy: Can be either "first_exhausted" or "all_exhausted", with the default being "first_exhausted". This parameter is passed to the `interleave_datasets` function.
- shuffle_buffer_size: This parameter is used to specify the shuffle buffer size for streaming datasets. Defaults to 1000.
- - enable_cache: Use cache for dataset preprocessing, default is False.
- download_mode: Dataset download mode, including `reuse_dataset_if_exists` and `force_redownload`, default is reuse_dataset_if_exists.
- columns: Used for column mapping of the dataset to ensure that the dataset conforms to the format that AutoPreprocessor can handle. For more details, see [here](../Customization/Custom-dataset.md). You can pass in a JSON string, for example: `'{"text1": "query", "text2": "response"}'`, with the default being None.
- strict: If set to True, any row with an issue in the dataset will throw an error immediately, otherwise, erroneous data samples will be discarded. Default is False.
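These dataset flags are ultimately forwarded to Hugging Face `datasets.Dataset.map`, which takes `num_proc` and `load_from_cache_file` arguments with the same meaning. A minimal sketch of what the two flags control, using toy data and a hypothetical preprocess function (not code from this PR):

```python
from datasets import Dataset

ds = Dataset.from_dict({'query': ['hi'], 'response': ['hello']})

def to_messages(row):
    # Hypothetical stand-in for swift's dataset preprocessors.
    return {'messages': [{'role': 'user', 'content': row['query']},
                         {'role': 'assistant', 'content': row['response']}]}

# num_proc / load_from_cache_file mirror dataset_num_proc / load_from_cache_file above;
# load_from_cache_file=False forces the map to be recomputed instead of read from the cache.
ds = ds.map(to_messages, num_proc=1, load_from_cache_file=False)
```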
1 change: 1 addition & 0 deletions examples/train/lora_sft.sh
@@ -1,4 +1,5 @@
# 22GB
+ # qwen3: https://github.com/modelscope/ms-swift/blob/main/examples/train/think_model/qwen3_demo1.sh
CUDA_VISIBLE_DEVICES=0 \
swift sft \
--model Qwen/Qwen2.5-7B-Instruct \
2 changes: 1 addition & 1 deletion requirements/install_all.sh
@@ -4,7 +4,7 @@ pip install "vllm>=0.5.1" -U
pip install "lmdeploy>=0.5" -U --no-d F438 eps
pip install autoawq -U --no-deps
pip install auto_gptq optimum bitsandbytes -U
- pip install git+https://github.com/modelscope/ms-swift.git#egg=ms-swift[all]
+ pip install git+https://github.com/modelscope/ms-swift.git
pip install timm -U
pip install deepspeed -U
pip install qwen_vl_utils qwen_omni_utils decord librosa pyav icecream soundfile -U
6 changes: 2 additions & 4 deletions swift/llm/argument/base_args/data_args.py
@@ -22,7 +22,6 @@ class DataArguments:
        data_seed (Optional[int]): Seed for dataset shuffling. Default is None.
        dataset_num_proc (int): Number of processes to use for data loading and preprocessing. Default is 1.
        streaming (bool): Flag to enable streaming of datasets. Default is False.
-        enable_cache (bool): Flag to load dataset from cache file. Default is False.
        download_mode (Literal): Mode for downloading datasets. Default is 'reuse_dataset_if_exists'.
        columns: Used for manual column mapping of datasets.
        model_name (List[str]): List containing Chinese and English names of the model. Default is [None, None].
@@ -39,14 +38,14 @@

    data_seed: Optional[int] = None
    dataset_num_proc: int = 1
+    load_from_cache_file: bool = True
    dataset_shuffle: bool = True
    val_dataset_shuffle: bool = False
    streaming: bool = False
    interleave_prob: Optional[List[float]] = None
    stopping_strategy: Literal['first_exhausted', 'all_exhausted'] = 'first_exhausted'
    shuffle_buffer_size: int = 1000

-    enable_cache: bool = False
    download_mode: Literal['force_redownload', 'reuse_dataset_if_exists'] = 'reuse_dataset_if_exists'
    columns: Optional[Union[dict, str]] = None
    strict: bool = False
@@ -69,8 +68,6 @@ def __post_init__(self):
        if self.data_seed is None:
            self.data_seed = self.seed
        self.columns = self.parse_to_dict(self.columns)
-        if self.enable_cache:
-            enable_caching()
        if len(self.val_dataset) > 0 or self.streaming:
            self.split_dataset_ratio = 0.
        if len(self.val_dataset) > 0:
@@ -84,6 +81,7 @@ def get_dataset_kwargs(self):
        return {
            'seed': self.data_seed,
            'num_proc': self.dataset_num_proc,
+            'load_from_cache_file': self.load_from_cache_file,
            'streaming': self.streaming,
            'interleave_prob': self.interleave_prob,
            'stopping_strategy': self.stopping_strategy,
29 changes: 14 additions & 15 deletions swift/llm/dataset/__init__.py
@@ -1,9 +1,9 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
+import inspect
+
import datasets.fingerprint
from datasets import Dataset as HfDataset
-from datasets import disable_caching

-from swift.utils.torch_utils import _find_local_mac
from ..utils import get_temporary_cache_files_directory
from . import dataset
from .loader import DATASET_TYPE, load_dataset
@@ -14,23 +14,22 @@
from .utils import (EncodePreprocessor, GetLengthPreprocessor, IterablePackingDataset, LazyLLMDataset, PackingDataset,
                    sample_dataset)

-_update_fingerprint = datasets.fingerprint.update_fingerprint
-_get_temporary_cache_files_directory = datasets.fingerprint.get_temporary_cache_files_directory
+update_fingerprint_origin = datasets.fingerprint.update_fingerprint


-def _update_fingerprint_mac(*args, **kwargs):
-    # Prevent different nodes use the same location in unique shared disk
-    mac = _find_local_mac().replace(':', '')
-    fp = _update_fingerprint(*args, **kwargs)
-    fp += '-' + mac
-    if len(fp) > 64:
-        fp = fp[:64]
-    return fp
+def update_fingerprint(fingerprint, transform, transform_args):
+    if 'function' in transform_args:
+        # Calculate the hash using the source code.
+        if hasattr(transform_args['function'], '__self__'):
+            function = inspect.getsource(transform_args['function'].__self__.__class__)
+        else:
+            function = inspect.getsource(transform_args['function'])
+        transform_args['function'] = function
+    return update_fingerprint_origin(fingerprint, transform, transform_args)


-datasets.fingerprint.update_fingerprint = _update_fingerprint_mac
-datasets.arrow_dataset.update_fingerprint = _update_fingerprint_mac
+datasets.fingerprint.update_fingerprint = update_fingerprint
+datasets.arrow_dataset.update_fingerprint = update_fingerprint
datasets.fingerprint.get_temporary_cache_files_directory = get_temporary_cache_files_directory
datasets.arrow_dataset.get_temporary_cache_files_directory = get_temporary_cache_files_directory
register_dataset_info()
-disable_caching()
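The patched `update_fingerprint` hashes a map function by its source code rather than by the pickled callable, so the same preprocessor produces the same cache fingerprint across runs, processes, and nodes. A minimal sketch of the idea behind the `__self__` branch, using a toy class rather than code from the repo:

```python
import inspect

class ToyPreprocessor:

    def __call__(self, row):
        return row

fn = ToyPreprocessor().__call__
# A bound method can hash differently from run to run (its object's state is part of
# the pickle), but the source text of its class is stable, so hashing the source gives
# a reproducible fingerprint for the datasets cache.
assert hasattr(fn, '__self__')
print(inspect.getsource(fn.__self__.__class__))
```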
15 changes: 12 additions & 3 deletions swift/llm/dataset/loader.py
@@ -198,6 +198,7 @@ def _load_dataset_path(
        dataset_meta: DatasetMeta,
        *,
        num_proc: int = 1,
+        load_from_cache_file: bool = True,
        strict: bool = False,
        streaming: bool = False,
        columns: Optional[Dict[str, str]] = None,
@@ -211,7 +212,8 @@
        dataset = hf_load_dataset(file_type, data_files=dataset_path, **kwargs)
        if columns:
            dataset = RowPreprocessor.safe_rename_columns(dataset, columns)
-        dataset = dataset_meta.preprocess_func(dataset, num_proc=num_proc, strict=strict)
+        dataset = dataset_meta.preprocess_func(
+            dataset, num_proc=num_proc, load_from_cache_file=load_from_cache_file, strict=strict)
        if remove_unused_columns:
            dataset = RowPreprocessor.remove_useless_columns(dataset)
        return dataset
@@ -222,6 +224,7 @@ def _load_repo_dataset(
        subset: SubsetDataset,
        *,
        num_proc: int = 1,
+        load_from_cache_file: bool = True,
        streaming: bool = False,
        use_hf: Optional[bool] = None,
        hub_token: Optional[str] = None,
@@ -282,7 +285,8 @@
            dataset = dataset.to_iterable_dataset()
        if columns:
            dataset = RowPreprocessor.safe_rename_columns(dataset, columns)
-        dataset = subset.preprocess_func(dataset, num_proc=num_proc, strict=strict)
+        dataset = subset.preprocess_func(
+            dataset, num_proc=num_proc, load_from_cache_file=load_from_cache_file, strict=strict)
        if remove_unused_columns:
            dataset = RowPreprocessor.remove_useless_columns(dataset)
        datasets.append(dataset)
@@ -373,6 +377,7 @@ def load(
        dataset_meta: Optional[DatasetMeta] = None,
        *,
        num_proc: int = 1,
+        load_from_cache_file: bool = True,
        streaming: bool = False,
        use_hf: Optional[bool] = None,
        hub_token: Optional[str] = None,
@@ -386,6 +391,7 @@
            dataset_syntax.dataset,
            dataset_meta=dataset_meta,
            num_proc=num_proc,
+            load_from_cache_file=load_from_cache_file,
            strict=strict,
            streaming=streaming,
            columns=columns,
@@ -402,6 +408,7 @@
            use_hf=use_hf,
            hub_token=hub_token,
            num_proc=num_proc,
+            load_from_cache_file=load_from_cache_file,
            strict=strict,
            revision=revision,
            streaming=streaming,
@@ -435,6 +442,7 @@ def load_dataset(
        split_dataset_ratio: float = 0.,
        seed: Union[int, np.random.RandomState, None] = None,
        num_proc: int = 1,
+        load_from_cache_file: bool = True,
        shuffle: bool = False,
        streaming: bool = False,
        interleave_prob: Optional[List[float]] = None,
@@ -444,7 +452,7 @@
        hub_token: Optional[str] = None,
        strict: bool = False,
        download_mode: Literal['force_redownload', 'reuse_dataset_if_exists'] = 'reuse_dataset_if_exists',
-        columns: Optional[Dict[str, str]] = None,
+        columns: Optional[Dict[str, str]] = None,  # columns_mapping
        remove_unused_columns: bool = True,
        # self-cognition
        model_name: Union[Tuple[str, str], List[str], None] = None,  # zh, en
@@ -482,6 +490,7 @@
    val_datasets = []
    load_kwargs = {
        'num_proc': num_proc,
+        'load_from_cache_file': load_from_cache_file,
        'strict': strict,
        'download_mode': download_mode,
        'columns': columns,
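A usage sketch of how the new keyword reaches `load_dataset`. The import path and the `(train, val)` return shape are assumptions based on the surrounding code (`from .loader import ... load_dataset` in `swift/llm/dataset/__init__.py` and the `train_datasets`/`val_datasets` accumulators above); the dataset id is a placeholder:

```python
from swift.llm.dataset import load_dataset  # assumed import path

train_dataset, val_dataset = load_dataset(
    ['<dataset-id-or-local-path>'],   # placeholder dataset specifier
    split_dataset_ratio=0.01,
    num_proc=4,
    load_from_cache_file=False,       # redo preprocessing instead of reusing the Arrow cache
    strict=False,
)
```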
18 changes: 13 additions & 5 deletions swift/llm/dataset/preprocessor/core.py
@@ -12,7 +12,7 @@
from datasets import Sequence, Value

from swift.llm import history_to_messages
-from swift.utils import get_logger
+from swift.utils import get_logger, is_dist, is_master, safe_ddp_context

DATASET_TYPE = Union[HfDataset, HfIterableDataset]

@@ -280,6 +280,7 @@ def __call__(
        dataset: DATASET_TYPE,
        *,
        num_proc: int = 1,
+        load_from_cache_file: bool = True,
        strict: bool = False,
        batch_size: Optional[int] = None,
    ) -> DATASET_TYPE:
@@ -291,17 +292,23 @@

        map_kwargs = {'batched': True, 'batch_size': batch_size}
        if isinstance(dataset, HfDataset):
-            map_kwargs['num_proc'] = num_proc
+            if not load_from_cache_file and is_dist() and not is_master():
+                load_from_cache_file = True
+            map_kwargs.update({
+                'num_proc': num_proc,
+                'load_from_cache_file': load_from_cache_file,
+            })
        # compat GRPO: The solution field will be retained.
        dataset = RowPreprocessor.get_features_dataset(dataset)
        if 'solution' in dataset.features:
-            dataset = dataset.map(lambda x: {'__#solution': x['solution']}, **map_kwargs)
+            with safe_ddp_context(None, True):
+                dataset = dataset.map(lambda x: {'__#solution': x['solution']}, **map_kwargs)
        dataset = self._rename_columns(dataset)
        dataset = self.prepare_dataset(dataset)
        dataset = self._cast_pil_image(dataset)

        ignore_max_length_error = True if isinstance(dataset, HfDataset) and num_proc > 1 else False
-        with self._patch_arrow_writer():
+        with self._patch_arrow_writer(), safe_ddp_context(None, True):
            try:
                dataset_mapped = dataset.map(
                    self.batched_preprocess,
@@ -514,8 +521,9 @@ def __call__(
        dataset: DATASET_TYPE,
        *,
        num_proc: int = 1,
+        load_from_cache_file: bool = True,
        strict: bool = False,
    ) -> DATASET_TYPE:
        dataset = RowPreprocessor.safe_rename_columns(dataset, self.columns)
        preprocessor = self._get_preprocessor(dataset)
-        return preprocessor(dataset, num_proc=num_proc, strict=strict)
+        return preprocessor(dataset, num_proc=num_proc, load_from_cache_file=load_from_cache_file, strict=strict)
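Two behaviors are worth noting in this hunk: when `load_from_cache_file=False` is requested in a distributed run, non-master ranks are flipped back to `True`, so only the master rank actually re-runs preprocessing; and the `map` calls are wrapped in `safe_ddp_context`, which coordinates the ranks so they do not race on the shared preprocessing cache. The same rank-guard idea, sketched with plain `torch.distributed` instead of swift's `is_dist`/`is_master` helpers:

```python
import torch.distributed as dist

def resolve_load_from_cache_file(load_from_cache_file: bool) -> bool:
    # Only rank 0 honours load_from_cache_file=False; every other rank keeps reading
    # the cache, which rank 0 refreshes before the others reach the map call.
    if not load_from_cache_file and dist.is_available() and dist.is_initialized() and dist.get_rank() != 0:
        return True
    return load_from_cache_file
```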