The Pronunciation Task Force develops specifications that give hypertext markup language (HTML) authors control over text-to-speech (TTS) presentation.

Appendix A. SSML JSON Schema

The JSON schema defines the specific SSML functions, properties, and values recommended for implementation in this proposal.


{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "http://ets-research.org/ia11ylab/ia/json/ssml-json-schema-w3cptf.json",
  "title": "SSML as a single attribute for inclusion in HTML",
  "description": "JSON structure representing each SSML element as a JSON object. The SSML properties are derived from https://www.w3.org/TR/speech-synthesis11/. Several elements are excluded: mark, speak, p, w and the desc attribute. Author: M. Hakkinen - ETS",
  "type": "object",
  "definitions": {
    "duration": {
      "description": "A time value: an optionally negative decimal number immediately followed by the unit ms or s, e.g. 250ms, 1.5s, .5s",
      "type": "string",
      "pattern": "^-?((0|[1-9]\\d*)(\\.\\d+)?|\\.\\d+)(ms|s)$"
    },
    "decibels": {
      "description": "A level in decibels: an optionally signed decimal number immediately followed by dB, e.g. +6dB, -3.5dB",
      "type": "string",
      "pattern": "^[+-]?((0|[1-9]\\d*)(\\.\\d+)?|\\.\\d+)dB$"
    }
  },
  "properties": {
    "say-as": {
      "description": "say-as function",
      "type": "object",
      "properties": {
        "interpret-as": {
          "type": "string",
          "enum": ["date", "time", "telephone", "characters", "cardinal", "ordinal"]
        },
        "format": { "type": "string" },
        "detail": { "type": "string" }
      }
    },
    "phoneme": {
      "description": "The Phoneme Function",
      "type": "object",
      "properties": {
        "ph": { "type": "string" },
        "alphabet": { "type": "string", "enum": ["ipa", "x-sampa"] }
      }
    },
    "sub": {
      "description": "sub function",
      "type": "object",
      "properties": {
        "alias": { "type": "string" }
      }
    },
    "voice": {
      "description": "voice function",
      "type": "object",
      "properties": {
        "gender": { "type": "string", "enum": ["female", "male", "neutral"] },
        "age": { "type": "integer" },
        "variant": { "type": "string" },
        "name": { "type": "string" },
        "languages": { "type": "string" }
      }
    },
    "emphasis": {
      "description": "speech emphasis level",
      "type": "object",
      "properties": {
        "level": {
          "type": "string",
          "enum": ["none", "x-weak", "weak", "medium", "strong", "x-strong"]
        },
        "time": { "$ref": "#/definitions/duration" }
      }
    },
    "prosody": {
      "description": "speech prosody",
      "type": "object",
      "properties": {
        "pitch": {
          "description": "A named pitch level or a frequency in Hz",
          "type": "string",
          "pattern": "^(x-low|low|medium|high|x-high|default|-?((0|[1-9]\\d*)(\\.\\d+)?|\\.\\d+)Hz)$"
        },
        "contour": { "type": "string" },
        "range": {
          "description": "A named pitch range or a frequency in Hz",
          "type": "string",
          "pattern": "^(x-low|low|medium|high|x-high|default|-?((0|[1-9]\\d*)(\\.\\d+)?|\\.\\d+)Hz)$"
        },
        "rate": {
          "description": "A named speaking rate or a percentage",
          "type": "string",
          "pattern": "^(x-slow|slow|medium|fast|x-fast|default|-?((0|[1-9]\\d*)(\\.\\d+)?|\\.\\d+)%)$"
        },
        "duration": { "$ref": "#/definitions/duration" },
        "volume": {
          "description": "A named volume level or a level in dB",
          "type": "string",
          "pattern": "^(silent|x-soft|soft|medium|loud|x-loud|default|[+-]?((0|[1-9]\\d*)(\\.\\d+)?|\\.\\d+)dB)$"
        }
      }
    },
    "break": {
      "description": "break - insert a timed pause",
      "type": "object",
      "properties": {
        "strength": {
          "type": "string",
          "enum": ["none", "x-weak", "weak", "medium", "strong", "x-strong"]
        },
        "time": { "$ref": "#/definitions/duration" }
      }
    },
    "audio": {
      "description": "audio element used to insert audio file into speech stream",
      "type": "object",
      "properties": {
        "src": { "type": "string", "format": "uri" },
        "fetchtimeout": { "$ref": "#/definitions/duration" },
        "fetchint": { "type": "string", "enum": ["safe", "prefetch"] },
        "maxage": { "type": "string" },
        "maxstale": { "type": "string" },
        "clipBegin": { "$ref": "#/definitions/duration" },
        "clipEnd": { "$ref": "#/definitions/duration" },
        "repeatCount": { "type": "integer" },
        "repeatDur": { "$ref": "#/definitions/duration" },
        "soundLevel": { "$ref": "#/definitions/decibels" },
        "speed": {
          "description": "Playback speed as a non-negative percentage",
          "type": "string",
          "pattern": "^((0|[1-9]\\d*)(\\.\\d+)?|\\.\\d+)%$"
        }
      }
    }
  }
}