当前位置: 首页 > news >正文

【Moonshine Onnx版本 语音识别】

## 安装环境

pip install onnxruntime numpy tokenizers librosa modelscope huggingface-hub

## 下载模型

huggingface

!huggingface-cli download UsefulSensors/moonshine --include 'onnx/base/*.onnx' --local-dir ./models/


# 下载tokenizer.json
!wget https://github.com/usefulsensors/moonshine/raw/main/moonshine/assets/tokenizer.json -P './models/onnx/base/'

modelscope

!modelscope download --model manyeyes/moonshine-base-en-onnx --local_dir ./models/

## 运行

import os
import wave
import numpy as np
import tokenizers
import onnxruntime


class MoonshineOnnxModel:
    """Speech-to-text wrapper around the four Moonshine ONNX graphs.

    The model directory must contain ``preprocess.onnx``, ``encode.onnx``,
    ``uncached_decode.onnx``, ``cached_decode.onnx`` and ``tokenizer.json``.
    """

    def __init__(self, models_dir):
        """Load the ONNX sessions and the tokenizer from *models_dir*.

        Args:
            models_dir: Directory holding the ``.onnx`` files and
                ``tokenizer.json``.
        """
        # os.path.join keeps the paths clean even when models_dir ends in "/".
        preprocess, encode, uncached_decode, cached_decode = [
            os.path.join(models_dir, f"{name}.onnx")
            for name in ["preprocess", "encode", "uncached_decode", "cached_decode"]
        ]
        self.preprocess = onnxruntime.InferenceSession(preprocess)
        self.encode = onnxruntime.InferenceSession(encode)
        self.uncached_decode = onnxruntime.InferenceSession(uncached_decode)
        self.cached_decode = onnxruntime.InferenceSession(cached_decode)
        self.tokenizer = tokenizers.Tokenizer.from_file(
            os.path.join(models_dir, "tokenizer.json")
        )
        print('Successfully Load Model.')

    def _generate(self, audio, max_len=None):
        """Greedily decode the token ids for a single audio clip.

        Args:
            audio: numpy array of shape ``[1, num_audio_samples]``.
            max_len: Maximum number of decoding steps. When ``None`` it is
                derived from the clip length (6 tokens per second of
                16 kHz audio).

        Returns:
            A one-element list holding the list of token ids (as plain
            ``int``), so the result can be passed to ``decode_batch``.
        """
        if max_len is None:
            # max 6 tokens per second of audio
            max_len = int((audio.shape[-1] / 16_000) * 6)

        preprocessed = self.preprocess.run([], dict(args_0=audio))[0]
        seq_len = [preprocessed.shape[-2]]
        context = self.encode.run(
            [], dict(args_0=preprocessed, args_1=seq_len)
        )[0]

        # Token id 1 seeds the decoder and id 2 stops it
        # (presumably the start/end-of-sequence ids -- confirm against the tokenizer).
        inputs = [[1]]
        seq_len = [1]
        tokens = [1]

        # The first step has no cache; it returns the logits followed by the
        # cache tensors that every later step feeds back in.
        logits, *cache = self.uncached_decode.run(
            [], dict(args_0=inputs, args_1=context, args_2=seq_len)
        )
        for _ in range(max_len):
            # Plain int keeps the token list free of numpy scalar types.
            next_token = int(logits.squeeze().argmax())
            tokens.append(next_token)
            if next_token == 2:
                break
            seq_len[0] += 1
            inputs = [[next_token]]
            # Cache tensors are passed positionally as args_3, args_4, ...
            cache_feed = {
                f"args_{idx + 3}": tensor for idx, tensor in enumerate(cache)
            }
            logits, *cache = self.cached_decode.run(
                [],
                dict(
                    args_0=inputs,
                    args_1=context,
                    args_2=seq_len,
                    **cache_feed,
                ),
            )
        return [tokens]

    def generate(self, audio_paths: list[str] | str, max_len=None):
        """Transcribe one or more WAV files.

        Each file is decoded on its own because ``_generate`` handles a
        single clip at a time; this also lets the files differ in length.

        Args:
            audio_paths: A path or a list of paths to mono, 16 kHz,
                16-bit PCM WAV files.
            max_len: Optional cap on decoding steps, forwarded to
                ``_generate`` for every file.

        Returns:
            A list with one transcription string per input file.

        Raises:
            AssertionError: If a file is not mono / 16 kHz / int16.
        """
        if isinstance(audio_paths, str):
            audio_paths = [audio_paths]

        tokens = []
        for audio_path in audio_paths:
            with wave.open(audio_path) as f:
                params = f.getparams()
                assert (
                    params.nchannels == 1
                    and params.framerate == 16_000
                    and params.sampwidth == 2
                ), "wave file should have 1 channel, 16KHz, and int16"
                frames = f.readframes(params.nframes)
            # Scale int16 PCM to float32 in [-1, 1) and add the batch axis.
            audio = np.frombuffer(frames, np.int16) / 32768.0
            audio = audio.astype(np.float32)[None, ...]
            tokens.extend(self._generate(audio, max_len))

        texts = self.tokenizer.decode_batch(tokens)
        return texts


if __name__ == "__main__":
    model_dir = "models/onnx/base/"
    client = MoonshineOnnxModel(model_dir)
    audio_path = "beckett.wav"
    text = client.generate(audio_path)
    print(text)


http://www.mrgr.cn/news/65584.html

相关文章:

  • OceanBase V4.3.3,首个面向实时分析场景的GA版本发布
  • Oracle OCP认证考试考点详解082系列11
  • Django中间件应该怎么使用
  • Selenium的下载及chrome环境搭建
  • Servlet和JSP入门详解
  • 天生倔强脸的白纸新人,徐畅演艺生涯初舞台获得肯定!
  • Linux之crontab使用
  • JavaEE-多线程初阶(3)
  • Android笔记(三十三):封装设备性能级别判断工具——低端机还是高端机
  • MySQL表的增删改查(CRUD2)
  • 栈和队列(三)
  • 新手入门c++,咳咳,(9),咳咳
  • 你从未见过的小主机,买也买不到的科技尤物,只让你眼馋
  • 考公VS考研,在职上班族拼哪个性价比高?
  • Ubuntu开启FTP与SSH服务
  • JS中面向对象
  • 源码阅读心得---如何从零开始阅读一个框架的源码
  • (七)Python运算符和优先级
  • 心觉:人每日60000念头,如何让你的时间精力只专注于核心目标?
  • R 语言数据导入与导出
  • 贝尔不等式的验证
  • “代码世界的必修课:Git完整指南“(3)
  • SSD201 SSD202D SigmaStar智能高清显示芯片
  • 「Mac畅玩鸿蒙与硬件15」鸿蒙UI组件篇5 - Slider 和 Progress 组件
  • 北京美信时代渠道代理:运维后期维保服务策略
  • 跳表原理笔记