diff --git a/README.md b/README.md index f243cc9..5306d74 100644 --- a/README.md +++ b/README.md @@ -42,4 +42,21 @@ Pass `--compute-type ` to control the numeric precision used during infere | `float16` | Slightly better accuracy, higher VRAM usage. | | `int8` | CPU-friendly, lower quality. | -If you see a CUDA error about mismatched library versions at startup, use `setup-venv-local-build.sh` to build ctranslate2 against your system CUDA version rather than using the PyPI wheel. \ No newline at end of file +If you see a CUDA error about mismatched library versions at startup, use `setup-venv-local-build.sh` to build ctranslate2 against your system CUDA version rather than using the PyPI wheel. + + +## Language and translation + +By default, the server auto-detects the spoken language and transcribes it as-is. + +| Argument | Default | Notes | +|----------|---------|-------| +| `--language <code>` | none (auto-detect) | Force a specific language, e.g. `--language en` or `--language sv`. Skips auto-detection, which is faster and avoids misidentification. | +| `--task transcribe` | default | Output text in the spoken language. | +| `--task translate` | | Translate speech to English regardless of source language. | + +> [!NOTE] +> The `.en` model variants (`base.en`, `small.en`, etc.) are English-only and do not support `--task translate` or non-English `--language`. Use a multilingual model (`large-v3`, `medium`) for multilingual or translation use cases. + +> [!WARNING] +> Omitting `--language` with a multilingual model and English-only speech may cause occasional misdetection. Pass `--language en` to avoid this if you only speak English. 
\ No newline at end of file diff --git a/stt-server.py b/stt-server.py index e08d226..8cba120 100755 --- a/stt-server.py +++ b/stt-server.py @@ -120,6 +120,8 @@ parser = argparse.ArgumentParser() parser.add_argument('--model', default='base.en') parser.add_argument('--device', default='cuda') parser.add_argument('--compute-type', default='int8_float16') +parser.add_argument('--language', default=None, help='language code (e.g. en, sv); omit for auto-detect') +parser.add_argument('--task', default='transcribe', choices=['transcribe', 'translate'], help='transcribe keeps the source language; translate converts to English') parser.add_argument('--verbose', '-v', action='store_true') args = parser.parse_args() verbose = args.verbose @@ -195,7 +197,8 @@ def transcription_worker(): try: segments, _ = model.transcribe( samples, - language='en', + language=args.language, + task=args.task, word_timestamps=True, vad_filter=False, )