"""Build the ASR syllable vocabulary from sentence TSV files.

Every non-empty ``sentence`` cell of the configured TSV files is passed
through ``text_handle.process_text``. The returned units are counted, the
frequent ones are written to ``config/asr_vocab_1.json`` (sorted by length)
and ``config/syllables_freq_100_plus.json`` (with counts, most frequent
first), and a frequency summary is printed.
"""
import json
from collections import Counter
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from text_handle import process_text

workspace_dir = Path(__file__).parent.parent.parent
data_dir = Path("/home/blacksheep/projekts/study")

# Only referenced by the disabled JSONL pass below; kept so that pass can be
# re-enabled without reconstructing the list.
input_files = [
    data_dir / "data/multilingual/hanziDB_translated-simplified.jsonl",
    data_dir / "data/multilingual/hanziDB_translated_validationset-simplified.jsonl",
    data_dir / "data/multilingual/tatoeba-tr-en-ug-uz-kz-zh-simplified.jsonl",
    data_dir / "data/multilingual/export_csv_database.jsonl",
    data_dir / "data/multilingual/tatoeba_sentences.jsonl",
]
output_file = workspace_dir / "config/asr_vocab_1.json"

# NOTE(review): variable names, the frequency file name and the printed labels
# all say "100", but the cut-off actually applied to the vocabulary is 150.
# Likewise a syllable seen exactly 100 times is counted both as "<= 100" and in
# the final "100+" bucket. The values are left untouched because changing them
# changes the generated vocabulary -- confirm the intended thresholds.
VOCAB_MIN_FREQ = 150  # minimum occurrences for a syllable to enter the vocab
RARE_MAX_FREQ = 100   # upper bound (inclusive) of the "rare" group in the summary

syllabizes = []

# Disabled JSONL pass (note: `export_syllabize` is not imported in this file).
# langs = ["uig_Arab"]
# for input_file in input_files:
#     with open(input_file, 'r', encoding='utf-8') as f:
#         total_lines = sum(1 for _ in f)
#         f.seek(0)
#         for line in tqdm(f, total=total_lines, desc=f"Processing lines {input_file}"):
#             data: dict[str, str] = json.loads(line)
#             for lang in langs:
#                 if lang in data:
#                     syllabizes.extend(export_syllabize(data[lang]))
# print(f'data_dir syllabize len: {len(syllabizes):,}, set: {len(set(syllabizes)):,}')

tsv_files = [
    # workspace_dir / "data/ug/test.tsv",
    # workspace_dir / "data/ug/invalidated.tsv",
    # workspace_dir / "data/ug/train.tsv",
    # workspace_dir / "data/ug/validated.tsv",
    # workspace_dir / "data/ug/reported.tsv",
    # workspace_dir / "data/ug/dev.tsv",
    # workspace_dir / "data/ug/other.tsv",

    # workspace_dir / ".data/ug/invalidated.tsv",
    workspace_dir / ".data/ug/train.tsv",
    # workspace_dir / ".data/ug/clip_durations.tsv",  # not sentence
    # workspace_dir / ".data/ug/test.tsv",
    # workspace_dir / ".data/ug/validated_sentences.tsv",
    # workspace_dir / ".data/ug/other.tsv",
    # workspace_dir / ".data/ug/validated.tsv",
    # workspace_dir / ".data/ug/dev.tsv",
    # workspace_dir / ".data/ug/unvalidated_sentences.tsv",
    # workspace_dir / ".data/ug/reported.tsv"  # Lacking sentence
]

for tsv_file in tsv_files:
    data = pd.read_csv(tsv_file, sep='\t')
    # Empty cells are read as NaN (a float), which has no .strip(); drop them
    # instead of crashing part-way through the file.
    sentences = data['sentence'].dropna()
    # Iterate the column directly (with a progress bar) rather than building a
    # Series per row via iterrows().
    for sentence in tqdm(sentences, total=len(sentences), desc=f"Processing {tsv_file}"):
        syllabizes.extend(process_text(str(sentence).strip()))

# Count how often each syllable occurs.
syllable_counter = Counter(syllabizes)

# Split into the frequent group (used for the vocabulary) and the rare group
# (only reported in the summary).
freq_100_plus = {k: v for k, v in syllable_counter.items() if v >= VOCAB_MIN_FREQ}
freq_100_minus = {k: v for k, v in syllable_counter.items() if v <= RARE_MAX_FREQ}

# Both output files live in the same directory; make sure it exists so the
# run does not fail only after all the processing is done.
output_file.parent.mkdir(parents=True, exist_ok=True)

# Save the frequent syllables alone, shortest first (ties keep count order).
vocab = sorted(freq_100_plus, key=len)
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)

# Save the frequent syllables together with their counts, most frequent first.
sorted_freq_100_plus = dict(sorted(freq_100_plus.items(), key=lambda x: x[1], reverse=True))
with open(workspace_dir / 'config/syllables_freq_100_plus.json', 'w', encoding='utf-8') as f:
    json.dump(sorted_freq_100_plus, f, ensure_ascii=False, indent=2)

# Summary.
print(f"总音节数: {len(syllabizes):,}")
print(f"唯一音节数: {len(syllable_counter):,}")
print(f"出现100次以上的音节数: {len(freq_100_plus):,}")
print(f"出现100次以下的音节数: {len(freq_100_minus):,}")

# Non-cumulative histogram of occurrence counts: buckets 0-9, 10-19, ..., 90-99
# are keyed 0..9 and everything from 100 upwards is folded into key 10, so the
# counts are scanned once instead of once per bucket.
bucket_counts = Counter(min(freq // 10, 10) for freq in syllable_counter.values())

print("\n=== 音节使用次数统计(区间) ===")
for low in range(0, 100, 10):
    high = low + 9
    count = bucket_counts[low // 10]
    print(f"出现 {low}-{high} 次: {count:,} 个音节")

# 100 occurrences and above.
count_100plus = bucket_counts[10]
print(f"出现 100+ 次: {count_100plus} 个音节")