feat: change waveform

2026-05-07 11:29:21 +06:00
commit d31233a79a
21 changed files with 5330 additions and 0 deletions
--- a/src/handle/export_vocab.py
+++ b/src/handle/export_vocab.py
@@ -0,0 +1,92 @@
+import json
+from pathlib import Path
+from typing import Counter
+import pandas as pd
+from tqdm import tqdm
+from text_handle import process_text
+
+workspace_dir = Path(__file__).parent.parent.parent  # directory three .parent steps up from this file
+# Absolute corpus location; input_files is referenced only by the commented-out JSONL pass further down.
+data_dir = Path("/home/blacksheep/projekts/study")
+input_files = [
+    data_dir / "data/multilingual/hanziDB_translated-simplified.jsonl",
+    data_dir / "data/multilingual/hanziDB_translated_validationset-simplified.jsonl",
+    data_dir / "data/multilingual/tatoeba-tr-en-ug-uz-kz-zh-simplified.jsonl",
+    data_dir / "data/multilingual/export_csv_database.jsonl",
+    data_dir / "data/multilingual/tatoeba_sentences.jsonl",
+]
+output_file =  workspace_dir / "config/asr_vocab_1.json"  # destination of the vocabulary JSON
+syllabizes = []  # all units returned by process_text, repeats included so they can be counted
+
+# langs = ["uig_Arab"]
+# for input_file in input_files:
+#     with open(input_file, 'r', encoding='utf-8') as f:
+#         total_lines = sum(1 for _ in f)
+#         f.seek(0)
+#         for line in tqdm(f, total=total_lines, desc=f"Processing lines {input_file}"):
+#             data: dict[str, str] = json.loads(line)
+#             for lang in langs:
+#                 if lang in data:
+#                     syllabizes.extend(export_syllabize(data[lang]))
+
+# print(f'data_dir syllabize len: {len(syllabizes):,}, set: {len(set(syllabizes)):,}')
+
+
+tsv_files = [  # TSV inputs to scan; every commented-out path below is a disabled alternative
+    # workspace_dir / "data/ug/test.tsv",
+    # workspace_dir / "data/ug/invalidated.tsv",
+    # workspace_dir / "data/ug/train.tsv",
+    # workspace_dir / "data/ug/validated.tsv",
+    # workspace_dir / "data/ug/reported.tsv",
+    # workspace_dir / "data/ug/dev.tsv",
+    # workspace_dir / "data/ug/other.tsv",
+    # workspace_dir / ".data/ug/invalidated.tsv",
+    workspace_dir / ".data/ug/train.tsv",
+    # workspace_dir / ".data/ug/clip_durations.tsv", # not sentence
+    # workspace_dir / ".data/ug/test.tsv",
+    # workspace_dir / ".data/ug/validated_sentences.tsv",
+    # workspace_dir / ".data/ug/other.tsv",
+    # workspace_dir / ".data/ug/validated.tsv",
+    # workspace_dir / ".data/ug/dev.tsv",
+    # workspace_dir / ".data/ug/unvalidated_sentences.tsv",
+    # workspace_dir / ".data/ug/reported.tsv" # Lacking sentence
+]
+
+for tsv_file in tsv_files:
+    data = pd.read_csv(tsv_file, sep='\t')
+    # Feed each sentence to process_text; blank cells load as NaN (a float, no .strip()), so drop them first.
+    for sentence in tqdm(data['sentence'].dropna(), desc=f"Processing {tsv_file}"):
+        syllabizes.extend(process_text(sentence.strip()))
+
+
+# Vocabulary cut-off: the filter keeps 150+, so the summary label below is derived from this value.
+min_vocab_freq = 150
+syllable_counter = Counter(syllabizes)  # typing.Counter alias; calling it builds a collections.Counter
+frequent = {unit: seen for unit, seen in syllable_counter.items() if seen >= min_vocab_freq}
+rare_count = sum(1 for seen in syllable_counter.values() if seen <= 100)
+
+# Vocabulary file: kept units only, shortest first (equal lengths keep first-seen order).
+vocab = sorted(frequent, key=len)
+with open(output_file, 'w', encoding='utf-8') as f:
+    json.dump(vocab, f, ensure_ascii=False, indent=2)
+
+# Kept units with their counts, most frequent first; the "100" in the file name is kept so the path is unchanged.
+sorted_frequent = dict(sorted(frequent.items(), key=lambda item: item[1], reverse=True))
+with open(workspace_dir / 'config/syllables_freq_100_plus.json', 'w', encoding='utf-8') as f:
+    json.dump(sorted_frequent, f, ensure_ascii=False, indent=2)
+
+# Summary: total, unique, kept (>= min_vocab_freq) and rare (<= 100); counts between the two are in neither.
+print(f"总音节数: {len(syllabizes):,}")
+print(f"唯一音节数: {len(syllable_counter):,}")
+print(f"出现{min_vocab_freq}次以上的音节数: {len(frequent):,}")
+print(f"出现100次以下的音节数: {rare_count:,}")
+
+# Per-bucket histogram (not cumulative): ten buckets of width 10, then one open-ended 100+ bucket.
+print("\n=== 音节使用次数统计（区间） ===")
+bucket_sizes = [0] * 11
+for occurrences in syllable_counter.values():
+    bucket_sizes[min(occurrences // 10, 10)] += 1  # slot 10 collects every count >= 100
+for bucket, size in enumerate(bucket_sizes[:10]):
+    print(f"出现 {bucket * 10}-{bucket * 10 + 9} 次: {size:,} 个音节")
+count_100plus = bucket_sizes[10]
+print(f"出现 100+ 次: {count_100plus} 个音节")