1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
|
""" 批量对话内容提取脚本 用于处理千恋万花游戏脚本中的所有JSON文件,提取中文对话并转换为指定格式 """
import json import re import os import sys import glob from pathlib import Path
def _dialogue_entry_from_item(item):
    """Return a dialogue record for *item*, or None if it is not an entry.

    An entry is a list of at least four elements whose second element is
    None and whose third element is a list with at least three slots. The
    third slot (index 2) must itself be a list ``[name, text, ...]``; this
    script treats that slot as the Simplified Chinese name/text pair.
    """
    if not (isinstance(item, list) and len(item) >= 4 and item[1] is None
            and isinstance(item[2], list) and len(item[2]) >= 3):
        return None

    localized = item[2][2]
    if not (isinstance(localized, list) and len(localized) >= 2):
        return None

    # The entry-level name is only a fallback for an empty localized name.
    fallback_name = item[0] if isinstance(item[0], str) else ""
    character = localized[0]
    if not character and fallback_name:
        character = fallback_name

    # The voice id is stored either in a dict or in the first dict of a list.
    extra = item[3]
    voice = ""
    if isinstance(extra, dict) and "voice" in extra:
        voice = extra["voice"]
    elif (isinstance(extra, list) and len(extra) > 0
            and isinstance(extra[0], dict) and "voice" in extra[0]):
        voice = extra[0]["voice"]

    return {'character': character, 'dialogue': localized[1], 'voice': voice}


def extract_dialogues_from_json(json_file_path):
    """Extract dialogue records from one JSON script file.

    The file is decoded with the first encoding in a fixed candidate list
    that does not raise ``UnicodeDecodeError``, parsed as JSON, and walked
    recursively through every nested list and dict. If JSON parsing fails,
    the text is handed to ``extract_dialogues_with_regex`` instead.

    Args:
        json_file_path (str): Path of the JSON file to read.

    Returns:
        list[dict]: One dict per dialogue entry with the keys
        ``'character'``, ``'dialogue'`` and ``'voice'``. An empty list is
        returned when the file cannot be decoded or any other error occurs
        (the error and its traceback are printed, not raised).
    """
    dialogues = []
    try:
        encodings = ['utf-8', 'utf-8-sig', 'gbk', 'shift_jis', 'cp932']
        content = None
        for encoding in encodings:
            try:
                with open(json_file_path, 'r', encoding=encoding) as file:
                    content = file.read()
                print(f"成功使用 {encoding} 编码读取文件")
                break
            except UnicodeDecodeError:
                continue

        if content is None:
            print("无法使用任何编码读取文件")
            return []

        # Plain 'utf-8' is tried first and also decodes BOM-prefixed files,
        # leaving U+FEFF at the start; json.loads rejects that, so drop it.
        if content.startswith('\ufeff'):
            content = content[1:]

        try:
            data = json.loads(content)
            print("成功解析JSON数据")
        except json.JSONDecodeError as e:
            print(f"JSON解析错误: {e}")
            return extract_dialogues_with_regex(content)

        def find_dialogues(obj):
            """Collect entries from *obj* and everything nested inside it."""
            if isinstance(obj, dict):
                for value in obj.values():
                    find_dialogues(value)
            elif isinstance(obj, list):
                for item in obj:
                    entry = _dialogue_entry_from_item(item)
                    if entry is not None:
                        dialogues.append(entry)
                    # Always descend, whether or not the item matched.
                    find_dialogues(item)

        find_dialogues(data)
        print(f"找到 {len(dialogues)} 个对话条目")
    except Exception as e:
        # Best-effort boundary: report the failure and yield no dialogues.
        print(f"处理文件时出错: {e}")
        import traceback
        traceback.print_exc()
        return []
    return dialogues
def extract_dialogues_with_regex(content):
    """Extract dialogue records from raw text with regular expressions.

    Fallback for text that could not be parsed as JSON. It looks for spans
    shaped like ``["name" or null, null, [ ... ], { ... } ... ]``, then pulls
    ``"a", "b"`` string pairs out of the bracketed part. A span is kept only
    if it yields at least three pairs; the third pair supplies the character
    name and dialogue text, matching the slot the JSON-based extraction in
    this file reads. The span's leading name is used only if the pair's
    name is empty.

    Args:
        content (str): Raw file text to scan.

    Returns:
        list[dict]: One dict per kept span with the keys ``'character'``,
        ``'dialogue'`` and ``'voice'`` (``''`` when no voice id is found).
    """
    # Compile once; these were previously rebuilt for every matched span.
    entry_pattern = re.compile(
        r'\[\s*(?:"([^"]+)"|null)\s*,\s*null\s*,\s*\[(.*?)\]\s*,\s*\{(.*?)\}.*?\]',
        re.DOTALL,
    )
    pair_pattern = re.compile(r'"([^"]+)"\s*,\s*"([^"]+)')
    voice_pattern = re.compile(r'"voice"\s*:\s*"([^"]+)"')

    matches = entry_pattern.findall(content)
    print(f"使用正则表达式找到 {len(matches)} 个匹配项")

    dialogues = []
    for raw_name, dialogue_array_str, voice_info in matches:
        pairs = pair_pattern.findall(dialogue_array_str)
        if len(pairs) < 3:
            continue
        localized_name, simplified_chinese = pairs[2]
        voice_match = voice_pattern.search(voice_info)
        dialogues.append({
            # Prefer the name stored with the text pair, as the JSON path does.
            'character': localized_name or raw_name,
            'dialogue': simplified_chinese,
            'voice': voice_match.group(1) if voice_match else "",
        })
    return dialogues
def convert_to_target_format(dialogues):
    """Render dialogue records as output lines.

    Each record becomes ``<character>:<dialogue>`` (or just the dialogue
    when the character is empty), followed by `` [<voice>.ogg]`` when a
    voice id is present.

    Args:
        dialogues (list): Records with the keys ``'character'``,
            ``'dialogue'`` and ``'voice'``.

    Returns:
        list: The rendered lines, in the same order as the input.
    """
    def render(record):
        speaker = record['character']
        line = record['dialogue']
        voice_id = record['voice']
        rendered = f"{speaker}:{line}" if speaker else line
        if voice_id:
            rendered += f" [{voice_id}.ogg]"
        return rendered

    return [render(record) for record in dialogues]
def process_json_file(input_file_path, output_file_path=None):
    """Extract, format and emit the dialogues of a single JSON file.

    Args:
        input_file_path (str): Path of the JSON file to process.
        output_file_path (str, optional): Where to write the formatted
            lines (UTF-8, one per line). When omitted or empty, the lines
            are printed to the console with a running number instead.

    Returns:
        int: Number of dialogue records extracted; 0 when none were found.
    """
    print(f"正在处理文件: {input_file_path}")

    extracted = extract_dialogues_from_json(input_file_path)
    if not extracted:
        print("未找到符合条件的对话内容")
        return 0

    total = len(extracted)
    print(f"找到 {total} 个对话")
    lines = convert_to_target_format(extracted)

    if not output_file_path:
        # Console mode: numbered listing under a header rule.
        print("\n转换后的对话内容:")
        print("=" * 50)
        for number, line in enumerate(lines, start=1):
            print(f"{number}. {line}")
        return total

    with open(output_file_path, 'w', encoding='utf-8') as out:
        out.writelines(line + '\n' for line in lines)
    print(f"结果已保存到: {output_file_path}")
    return total
def _dialogue_output_name(json_file):
    """Return the ``<stem>_dialogues.txt`` file name for a JSON file path."""
    # splitext strips only the final extension, whatever its letter case.
    stem, _ext = os.path.splitext(os.path.basename(json_file))
    return f"{stem}_dialogues.txt"


def batch_process_all_json_files(input_directory, output_directory):
    """Process every ``*.json`` file under a directory and write a summary.

    Each file found under ``input_directory`` (searched recursively) is
    passed to ``process_json_file`` with an output path of
    ``<output_directory>/<stem>_dialogues.txt``. A failure on one file is
    printed and does not stop the batch. A ``processing_summary.txt``
    report is then written into ``output_directory``.

    NOTE: output names are built from the base name only, so files with the
    same name in different subdirectories map to the same output file.

    Args:
        input_directory (str): Directory to search for JSON files.
        output_directory (str): Directory for the result files; created if
            it does not exist.
    """
    from datetime import datetime  # local import: only needed for the report

    os.makedirs(output_directory, exist_ok=True)

    # With recursive=True, "**" also matches zero directories, so files
    # directly inside input_directory are included. Sorting keeps the
    # processing order and the report deterministic.
    json_files = sorted(set(
        glob.glob(os.path.join(input_directory, "**", "*.json"), recursive=True)
    ))
    print(f"找到 {len(json_files)} 个JSON文件")

    total_dialogues = 0
    processed_files = 0
    for json_file in json_files:
        try:
            output_file_path = os.path.join(
                output_directory, _dialogue_output_name(json_file)
            )
            total_dialogues += process_json_file(json_file, output_file_path)
            processed_files += 1
            print(f"已完成 {processed_files}/{len(json_files)} 个文件")
            print("-" * 50)
        except Exception as e:
            # Best-effort batch: report the failure and move on.
            print(f"处理文件 {json_file} 时出错: {e}")
            continue

    print(f"\n批量处理完成!")
    print(f"成功处理 {processed_files}/{len(json_files)} 个文件")
    print(f"总共提取 {total_dialogues} 个对话条目")

    summary_file = os.path.join(output_directory, "processing_summary.txt")
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("批量对话处理汇总报告\n")
        f.write("=" * 50 + "\n")
        # Record when the batch ran, as a readable local timestamp.
        f.write(f"处理时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"输入目录: {input_directory}\n")
        f.write(f"输出目录: {output_directory}\n")
        f.write(f"找到的JSON文件总数: {len(json_files)}\n")
        f.write(f"成功处理的文件数: {processed_files}\n")
        f.write(f"总共提取的对话条目数: {total_dialogues}\n")
        f.write("\n处理的文件列表:\n")
        for json_file in json_files:
            file_name = os.path.basename(json_file)
            f.write(f"- {file_name} -> {_dialogue_output_name(json_file)}\n")
    print(f"汇总报告已保存到: {summary_file}")
def main():
    """Run the batch extraction.

    With no command-line arguments, the built-in default directories are
    used. Optionally, the first argument overrides the input directory and
    the second the output directory; when only the input directory is
    given, results go to an ``aaa`` subdirectory inside it, mirroring the
    layout of the defaults.
    """
    args = sys.argv[1:]
    if args:
        input_directory = args[0]
        if len(args) > 1:
            output_directory = args[1]
        else:
            output_directory = os.path.join(input_directory, "aaa")
    else:
        input_directory = r"C:\Users\lgf\Desktop\galageimu\千恋万花"
        output_directory = r"C:\Users\lgf\Desktop\galageimu\千恋万花\aaa"

    print("开始批量处理所有JSON文件...")
    print(f"输入目录: {input_directory}")
    print(f"输出目录: {output_directory}")
    print("=" * 60)
    batch_process_all_json_files(input_directory, output_directory)
# Run the batch extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|