Files
audio_model/src/handle/export_vocab.py
2026-05-07 11:29:21 +06:00

92 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
from pathlib import Path
from typing import Counter
import pandas as pd
from tqdm import tqdm
from text_handle import process_text
# Third ancestor of this file's path (audio_model/ when the script lives at
# audio_model/src/handle/export_vocab.py).
_script_path = Path(__file__)
workspace_dir = _script_path.parent.parent.parent

# Absolute, machine-specific directory that holds the text corpora.
data_dir = Path("/home/blacksheep/projekts/study")

# File names of the multilingual JSONL corpora, all stored in the same
# data/multilingual directory under data_dir.
_multilingual_names = (
    "hanziDB_translated-simplified.jsonl",
    "hanziDB_translated_validationset-simplified.jsonl",
    "tatoeba-tr-en-ug-uz-kz-zh-simplified.jsonl",
    "export_csv_database.jsonl",
    "tatoeba_sentences.jsonl",
)
input_files = [
    data_dir / "data/multilingual" / corpus_name
    for corpus_name in _multilingual_names
]

# Destination of the exported vocabulary JSON.
output_file = workspace_dir / "config/asr_vocab_1.json"
# Flat accumulator for every item that process_text yields across all input
# sentences; duplicates are kept because occurrences are counted later.
syllabizes = []
# Disabled alternative pass over the JSONL corpora listed in input_files.
# NOTE: it calls export_syllabize, which this file does not import, so it
# cannot be re-enabled without restoring that import (and its indentation).
# langs = ["uig_Arab"]
# for input_file in input_files:
# with open(input_file, 'r', encoding='utf-8') as f:
# total_lines = sum(1 for _ in f)
# f.seek(0)
# for line in tqdm(f, total=total_lines, desc=f"Processing lines {input_file}"):
# data: dict[str, str] = json.loads(line)
# for lang in langs:
# if lang in data:
# syllabizes.extend(export_syllabize(data[lang]))
# print(f'data_dir syllabize len: {len(syllabizes):,}, set: {len(set(syllabizes)):,}')
# TSV inputs to scan for sentences. Only the uncommented entry is read; the
# commented-out paths are other splits that are currently switched off.
tsv_files = [
    # workspace_dir / "data/ug/test.tsv",
    # workspace_dir / "data/ug/invalidated.tsv",
    # workspace_dir / "data/ug/train.tsv",
    # workspace_dir / "data/ug/validated.tsv",
    # workspace_dir / "data/ug/reported.tsv",
    # workspace_dir / "data/ug/dev.tsv",
    # workspace_dir / "data/ug/other.tsv",
    # workspace_dir / ".data/ug/invalidated.tsv",
    workspace_dir / ".data/ug/train.tsv",
    # workspace_dir / ".data/ug/clip_durations.tsv",  # not sentence
    # workspace_dir / ".data/ug/test.tsv",
    # workspace_dir / ".data/ug/validated_sentences.tsv",
    # workspace_dir / ".data/ug/other.tsv",
    # workspace_dir / ".data/ug/validated.tsv",
    # workspace_dir / ".data/ug/dev.tsv",
    # workspace_dir / ".data/ug/unvalidated_sentences.tsv",
    # workspace_dir / ".data/ug/reported.tsv"  # Lacking sentence
]
# Feed every sentence of every selected TSV file through process_text and
# collect the resulting items in syllabizes.
for tsv_file in tsv_files:
    # NOTE(review): if sentences can contain unbalanced double quotes, the
    # default CSV quoting may merge rows; consider quoting=csv.QUOTE_NONE.
    data = pd.read_csv(tsv_file, sep='\t')
    # Empty cells are parsed as NaN (a float), which has no .strip(); drop
    # them and coerce the rest to str so a numeric-looking cell cannot crash
    # the run either.
    sentences = data['sentence'].dropna().astype(str)
    # Iterate the column directly (with a progress bar) instead of building
    # a full row object per line via iterrows().
    for sentence in tqdm(sentences, total=len(sentences), desc=f"Processing {tsv_file}"):
        syllabizes.extend(process_text(sentence.strip()))
# Count how many times each syllable occurs across all processed sentences.
syllable_counter = Counter(syllabizes)

# Minimum occurrence count a syllable needs in order to be exported. The two
# filters and the report labels below all derive from this single value, so
# the printed numbers always describe the threshold that was really applied.
# (The "100" in the variable and file names is historical and left unchanged.)
min_freq = 150

# Syllables kept for the vocabulary ...
freq_100_plus = {k: v for k, v in syllable_counter.items() if v >= min_freq}
# ... and the exact complement, so every syllable is in exactly one of the two.
freq_100_minus = {k: v for k, v in syllable_counter.items() if v < min_freq}

# Make sure the output directories exist (e.g. on a fresh checkout).
freq_file = workspace_dir / 'config/syllables_freq_100_plus.json'
output_file.parent.mkdir(parents=True, exist_ok=True)
freq_file.parent.mkdir(parents=True, exist_ok=True)

# Vocabulary file: kept syllables only, shortest first. sorted() is stable,
# so syllables of equal length keep their first-seen order.
vocab = sorted(freq_100_plus, key=len)
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(vocab, f, ensure_ascii=False, indent=2)

# Frequency file: kept syllables with their counts, most frequent first.
sorted_freq_100_plus = dict(sorted(freq_100_plus.items(), key=lambda x: x[1], reverse=True))
with open(freq_file, 'w', encoding='utf-8') as f:
    json.dump(sorted_freq_100_plus, f, ensure_ascii=False, indent=2)

# Summary figures.
print(f"总音节数: {len(syllabizes):,}")
print(f"唯一音节数: {len(syllable_counter):,}")
print(f"出现{min_freq}次以上的音节数: {len(freq_100_plus):,}")
print(f"出现不足{min_freq}次的音节数: {len(freq_100_minus):,}")

# Non-cumulative histogram, tallied in a single pass: bucket i (i < 10) holds
# the syllables whose count lies in [10*i, 10*i + 9]; bucket 10 collects every
# syllable that occurs 100 times or more.
bucket_counts = Counter(min(freq // 10, 10) for freq in syllable_counter.values())
print("\n=== 音节使用次数统计(区间) ===")
for low in range(0, 100, 10):
    high = low + 9
    count = bucket_counts[low // 10]
    print(f"出现 {low}-{high} 次: {count:,} 个音节")
# 100 occurrences and above.
count_100plus = bucket_counts[10]
print(f"出现 100+ 次: {count_100plus:,} 个音节")