diff --git a/Cargo.toml b/Cargo.toml index e0d80afa..1cd36576 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ repository = "https://github.com/NSoiffer/MathCAT" homepage = "https://nsoiffer.github.io/MathCAT/" documentation = "https://nsoiffer.github.io/MathCAT/" edition = "2018" -exclude = ["src/main.rs", "docs", "PythonScripts"] # should have "Rules/", but then one can't run build.rs to build the zip file +exclude = ["src/main.rs", "examples/**", "docs", "PythonScripts"] # should have "Rules/", but then one can't run build.rs to build the zip file [features] diff --git a/examples/stateless-example.rs b/examples/stateless-example.rs new file mode 100644 index 00000000..d7464871 --- /dev/null +++ b/examples/stateless-example.rs @@ -0,0 +1,106 @@ +use env_logger; +use lazy_static::lazy_static; +use libmathcat::{errors::Error, stateless_interface::*}; +use std::{ops::Deref, path::Path, sync::mpsc, thread}; + +const EXPR_1: &'static str = r#"sin2x"#; +const EXPR_2: &'static str = r#"cos2x"#; + +struct MathCatHolder { + language: &'static str, + mathcat: MathCat, +} + +// `mathcat` is full of RCs, need to figure out what to do here to explain to Rust +// that it's OK to use a non-mut MathCat across threads because all Rcs are actually +// owned by the `MathCat` instance. +unsafe impl Sync for MathCatHolder {} + +fn build_mathcat(language: &'static str) -> Result { + let rules_dir = std::env::current_exe().unwrap().parent().unwrap().join("../../../Rules"); + let rules_dir = rules_dir.as_os_str().to_str().unwrap().to_string(); + + let mut builder = MathCatBuilder::new(); + builder.set_rules_dir(Path::new(&rules_dir)); + builder.set_pref("Language", language); + Ok(MathCatHolder { language: language, mathcat: builder.build()? 
}) +} + +fn main() -> Result<(), Error> { + // Run with RUST_LOG=debug to see some debugging information. + env_logger::builder() + .format_timestamp(None) + .format_module_path(false) + .format_indent(Some(2)) + .format_level(false) + .init(); + + lazy_static! { + static ref mathcat_en: MathCatHolder = build_mathcat("en").unwrap(); + static ref mathcat_es: MathCatHolder = build_mathcat("es").unwrap(); + } + + // Initialization is not thread-safe, ensure everything is initialized: + let _ = mathcat_en.deref(); + let _ = mathcat_es.deref(); + + // Once initialized, MathCat instances are thread-compatible. + let (tx, rx) = mpsc::channel(); + let mut threads = Vec::)>>>>::new(); + { + let tx = tx.clone(); + threads.push(thread::spawn(move || { + tx.send(( + mathcat_en.language, + mathcat_en.mathcat.mathml_to_spoken_text(EXPR_1))) + })); + } + { + let tx = tx.clone(); + threads.push(thread::spawn(move || { + tx.send(( + mathcat_en.language, + mathcat_en.mathcat.mathml_to_spoken_text(EXPR_2))) + })); + } + { + let tx = tx.clone(); + threads.push(thread::spawn(move || { + tx.send(( + mathcat_es.language, + mathcat_es.mathcat.mathml_to_spoken_text(EXPR_1))) + })); + } + { + let tx = tx.clone(); + threads.push(thread::spawn(move || { + tx.send(( + mathcat_es.language, + mathcat_es.mathcat.mathml_to_spoken_text(EXPR_2))) + })); + } + + let rcv_thread = thread::spawn(move || { + let mut pending = 4; + let mut has_errors = false; + while let Ok((language, result)) = rx.recv() { + match result { + Ok(text) => println!("{}: {}", language, text), + Err(e) => { + has_errors = true; + println!("{}: Error!\n{:?}", language, e); + } + }; + pending -= 1; + if pending == 0 { break; } + } + has_errors + }); + + for thread in threads { + let _ = thread.join().unwrap(); + } + let has_errors = rcv_thread.join().unwrap(); + if has_errors { std::process::exit(1); } + Ok(()) +} diff --git a/src/braille.rs b/src/braille.rs index 3c5645ef..b9f5f4a8 100644 --- a/src/braille.rs +++
b/src/braille.rs @@ -25,35 +25,38 @@ pub fn braille_mathml(mathml: Element, nav_node_id: &str) -> Result<(String, usi return BRAILLE_RULES.with(|rules| { rules.borrow_mut().read_files()?; let rules = rules.borrow(); - let new_package = Package::new(); - let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), nav_node_id); - let braille_string = rules_with_context.match_pattern::(mathml) - .chain_err(|| "Pattern match/replacement failure!")?; - // debug!("braille_mathml: braille string: {}", &braille_string); - let braille_string = braille_string.replace(' ', ""); - let pref_manager = rules_with_context.get_rules().pref_manager.borrow(); - let highlight_style = pref_manager.pref_to_string("BrailleNavHighlight"); - let braille_code = pref_manager.pref_to_string("BrailleCode"); - let braille = match braille_code.as_str() { - "Nemeth" => nemeth_cleanup(pref_manager, braille_string), - "UEB" => ueb_cleanup(pref_manager, braille_string), - "Vietnam" => vietnam_cleanup(pref_manager, braille_string), - "CMU" => cmu_cleanup(pref_manager, braille_string), - "Finnish" => finnish_cleanup(pref_manager, braille_string), - "Swedish" => swedish_cleanup(pref_manager, braille_string), - "LaTeX" => LaTeX_cleanup(pref_manager, braille_string), - "ASCIIMath" => ASCIIMath_cleanup(pref_manager, braille_string), - _ => braille_string.trim_matches('⠀').to_string(), // probably needs cleanup if someone has another code, but this will have to get added by hand - }; + return SPEECH_DEFINITIONS.with_borrow(|definitions| { + let new_package = Package::new(); + let mut rules_with_context = + SpeechRulesWithContext::new(&rules, definitions, new_package.as_document(), nav_node_id); + let braille_string = rules_with_context.match_pattern::(mathml) + .chain_err(|| "Pattern match/replacement failure!")?; + // debug!("braille_mathml: braille string: {}", &braille_string); + let braille_string = braille_string.replace(' ', ""); + let pref_manager = 
rules_with_context.get_rules().pref_manager.borrow(); + let highlight_style = pref_manager.pref_to_string("BrailleNavHighlight"); + let braille_code = pref_manager.pref_to_string("BrailleCode"); + let braille = match braille_code.as_str() { + "Nemeth" => nemeth_cleanup(pref_manager, braille_string), + "UEB" => ueb_cleanup(pref_manager, braille_string), + "Vietnam" => vietnam_cleanup(pref_manager, braille_string), + "CMU" => cmu_cleanup(pref_manager, braille_string), + "Finnish" => finnish_cleanup(pref_manager, braille_string), + "Swedish" => swedish_cleanup(pref_manager, braille_string), + "LaTeX" => LaTeX_cleanup(pref_manager, braille_string), + "ASCIIMath" => ASCIIMath_cleanup(pref_manager, braille_string), + _ => braille_string.trim_matches('⠀').to_string(), // probably needs cleanup if someone has another code, but this will have to get added by hand + }; - return Ok( - if highlight_style != "Off" { - highlight_braille_chars(braille, &braille_code, highlight_style == "All") - } else { - let end = braille.len()/3; - (braille, 0, end) - } - ); + return Ok( + if highlight_style != "Off" { + highlight_braille_chars(braille, &braille_code, highlight_style == "All") + } else { + let end = braille.len()/3; + (braille, 0, end) + } + ); + }); }); /// highlight with dots 7 & 8 based on the highlight style diff --git a/src/canonicalize.rs b/src/canonicalize.rs index c7117868..767fb6de 100644 --- a/src/canonicalize.rs +++ b/src/canonicalize.rs @@ -6,7 +6,9 @@ //! * known "bad" MathML is cleaned up (this will likely be an ongoing effort) //! * mrows are added based on operator priorities from the MathML Operator Dictionary #![allow(clippy::needless_return)] +use crate::definitions::Definitions; use crate::errors::*; +use crate::prefs::PreferenceManager; use std::rc::Rc; use std::cell::RefCell; use sxd_document::dom::*; @@ -398,33 +400,6 @@ pub fn get_presentation_element(element: Element) -> (usize, Element) { } } -/// Canonicalize does several things: -/// 1. 
cleans up the tree so all extra white space is removed (should only have element and text nodes) -/// 2. normalize the characters -/// 3. clean up "bad" MathML based on known output from some converters (TODO: still a work in progress) -/// 4. the tree is "parsed" based on the mo (priority)/mi/mn's in an mrow -/// * this adds mrows mrows and some invisible operators (implied times, function app, ...) -/// * extra mrows are removed -/// * implicit mrows are turned into explicit mrows (e.g, there will be a single child of 'math') -/// -/// Canonicalize is pretty conservative in adding new mrows and won't do it if: -/// * there is an intent attr -/// * if the mrow starts and ends with a fence (e.g, French open interval "]0,1[") -/// -/// An mrow is never deleted unless it is redundant. -/// -/// Whitespace handling: -/// Whitespace complicates parsing and also pattern matching (e.g., is it a mixed number which tests for a number preceding a fraction) -/// The first attempt which mostly worked was to shove whitespace into adjacent mi/mn/mtext. That has a problem with distinguish different uses for whitespace -/// The second attempt was to leave it in the parse and make it an mo when appropriate, but there were some cases where it should be prefix and wasn't caught -/// The third attempt (and the current one) is to make it an attribute on adjacent elements. -/// This preserves the data-width attr (with new name) added in the second attempt that helps resolve whether something is tweaking, a real space, or an omission. -/// It adds data-previous-space-width/data-following-space-width with values to indicate with the space was on the left or right (typically it placed on the previous token because that's easier) -pub fn canonicalize(mathml: Element) -> Result { - let context = CanonicalizeContext::new(); - return context.canonicalize(mathml); -} - #[derive(Debug, PartialEq)] enum FunctionNameCertainty { True, @@ -453,6 +428,20 @@ lazy_static! 
{ } +#[derive(PartialEq)] +struct CanonicalizeContextPatternsOptions { // separator preferences the number regexes are built from; compared with != to detect stale cached patterns + decimal_separator: String, + block_separator: String, +} +impl CanonicalizeContextPatternsOptions { + fn from_prefs(pref_manager: &PreferenceManager) -> CanonicalizeContextPatternsOptions { + return CanonicalizeContextPatternsOptions { + decimal_separator: pref_manager.pref_to_string("DecimalSeparators"), + block_separator: pref_manager.pref_to_string("BlockSeparators"), + } + } +} + struct CanonicalizeContextPatterns { decimal_separator: Regex, block_separator: Regex, @@ -464,13 +453,13 @@ struct CanonicalizeContextPatterns { } impl CanonicalizeContextPatterns { - fn new(block_separator_pref: &str, decimal_separator_pref: &str) -> CanonicalizeContextPatterns { - let block_separator = Regex::new(&format!("[{}]", regex::escape(block_separator_pref))).unwrap(); - let decimal_separator = Regex::new(&format!("[{}]", regex::escape(decimal_separator_pref))).unwrap(); + fn new(options: &CanonicalizeContextPatternsOptions) -> CanonicalizeContextPatterns { + let block_separator = Regex::new(&format!("[{}]", regex::escape(&options.block_separator))).unwrap(); + let decimal_separator = Regex::new(&format!("[{}]", regex::escape(&options.decimal_separator))).unwrap(); // allows just "."
and also matches an empty string, but those are ruled out elsewhere - let digit_only_decimal_number = Regex::new(&format!(r"^\d*{}?\d*$", regex::escape(decimal_separator_pref))).unwrap(); - let block_3digit_pattern = get_number_pattern_regex(block_separator_pref, decimal_separator_pref, 3, 3); - let block_3_5digit_pattern = get_number_pattern_regex(block_separator_pref, decimal_separator_pref, 3, 5); + let digit_only_decimal_number = Regex::new(&format!(r"^\d*{}?\d*$", regex::escape(&options.decimal_separator))).unwrap(); + let block_3digit_pattern = get_number_pattern_regex(&options.block_separator, &options.decimal_separator, 3, 3); + let block_3_5digit_pattern = get_number_pattern_regex(&options.block_separator, &options.decimal_separator, 3, 5); // Note: on en.wikipedia.org/wiki/Decimal_separator, show '3.14159 26535 89793 23846' let block_4digit_hex_pattern = Regex::new(r"^[0-9a-fA-F]{4}([ \u00A0\u202F][0-9a-fA-F]{4})*$").unwrap(); let block_1digit_pattern = Regex::new(r"^((\d(\uFFFF\d)?)(\d([, \u00A0\u202F]\d){2})*)?([\.](\d(\uFFFF\d)*)?)?$").unwrap(); @@ -499,9 +488,10 @@ impl CanonicalizeContextPatterns { /// Profiling showed that creating new contexts was very time consuming because creating the RegExs is very expensive /// Profiling set_mathml (which does the canonicalization) spends 65% of the time in Regex::new, of which half of it is spent in this initialization. 
struct CanonicalizeContextPatternsCache { - block_separator_pref: String, - decimal_separator_pref: String, patterns: Rc, + + // options act as the cache key + options: CanonicalizeContextPatternsOptions, } thread_local!{ @@ -512,12 +502,10 @@ impl CanonicalizeContextPatternsCache { fn new() -> CanonicalizeContextPatternsCache { let pref_manager = crate::prefs::PreferenceManager::get(); let pref_manager = pref_manager.borrow(); - let block_separator_pref = pref_manager.pref_to_string("BlockSeparators"); - let decimal_separator_pref = pref_manager.pref_to_string("DecimalSeparators"); + let options = CanonicalizeContextPatternsOptions::from_prefs(&pref_manager); return CanonicalizeContextPatternsCache { - patterns: Rc::new( CanonicalizeContextPatterns::new(&block_separator_pref, &decimal_separator_pref) ), - block_separator_pref, - decimal_separator_pref + patterns: Rc::new( CanonicalizeContextPatterns::new(&options) ), + options: options, } } @@ -525,34 +513,62 @@ impl CanonicalizeContextPatternsCache { return PATTERN_CACHE.with( |cache| { let pref_manager_rc = crate::prefs::PreferenceManager::get(); let pref_manager = pref_manager_rc.borrow(); - let block_separator_pref = pref_manager.pref_to_string("BlockSeparators"); - let decimal_separator_pref = pref_manager.pref_to_string("DecimalSeparators"); + let new_options = CanonicalizeContextPatternsOptions::from_prefs(&pref_manager); let mut cache = cache.borrow_mut(); - if block_separator_pref != cache.block_separator_pref || decimal_separator_pref != cache.decimal_separator_pref { + if new_options != cache.options { // update the cache - cache.patterns = Rc::new( CanonicalizeContextPatterns::new(&block_separator_pref, &decimal_separator_pref) ); - cache.block_separator_pref = block_separator_pref; - cache.decimal_separator_pref = decimal_separator_pref; + cache.patterns = Rc::new( CanonicalizeContextPatterns::new(&new_options) ); + cache.options = new_options; } return cache.patterns.clone(); }) } } -struct 
CanonicalizeContext { +pub (crate) struct CanonicalizeContext { patterns: Rc, } impl CanonicalizeContext { - fn new() -> CanonicalizeContext { + pub fn new_uncached(pref_manager: &PreferenceManager) -> CanonicalizeContext { + return CanonicalizeContext { + patterns: Rc::new( CanonicalizeContextPatterns::new( + &CanonicalizeContextPatternsOptions::from_prefs(pref_manager)) ), + }; + } + + /// Returns the context backed by global preferences and cache. + pub fn new_from_global_prefs_cached() -> CanonicalizeContext { return CanonicalizeContext { patterns: CanonicalizeContextPatternsCache::get(), }; } - fn canonicalize<'a>(&self, mut mathml: Element<'a>) -> Result> { + /// Canonicalize does several things: + /// 1. cleans up the tree so all extra white space is removed (should only have element and text nodes) + /// 2. normalize the characters + /// 3. clean up "bad" MathML based on known output from some converters (TODO: still a work in progress) + /// 4. the tree is "parsed" based on the mo (priority)/mi/mn's in an mrow + /// * this adds mrows mrows and some invisible operators (implied times, function app, ...) + /// * extra mrows are removed + /// * implicit mrows are turned into explicit mrows (e.g, there will be a single child of 'math') + /// + /// Canonicalize is pretty conservative in adding new mrows and won't do it if: + /// * there is an intent attr + /// * if the mrow starts and ends with a fence (e.g, French open interval "]0,1[") + /// + /// An mrow is never deleted unless it is redundant. + /// + /// Whitespace handling: + /// Whitespace complicates parsing and also pattern matching (e.g., is it a mixed number which tests for a number preceding a fraction) + /// The first attempt which mostly worked was to shove whitespace into adjacent mi/mn/mtext. 
That has a problem with distinguish different uses for whitespace + /// The second attempt was to leave it in the parse and make it an mo when appropriate, but there were some cases where it should be prefix and wasn't caught + /// The third attempt (and the current one) is to make it an attribute on adjacent elements. + /// This preserves the data-width attr (with new name) added in the second attempt that helps resolve whether something is tweaking, a real space, or an omission. + /// It adds data-previous-space-width/data-following-space-width with values to indicate with the space was on the left or right (typically it placed on the previous token because that's easier) + pub fn canonicalize<'a>(&self, definitions: &Definitions, mut mathml: Element<'a>) -> Result> { // debug!("MathML before canonicalize:\n{}", mml_to_string(mathml)); if name(mathml) != "math" { @@ -566,15 +582,15 @@ impl CanonicalizeContext { mathml = root.children()[0].element().unwrap(); } CanonicalizeContext::assure_mathml(mathml)?; - let mathml = self.clean_mathml(mathml).unwrap(); // 'math' is never removed + let mathml = self.clean_mathml(definitions, mathml).unwrap(); // 'math' is never removed self.assure_nary_tag_has_one_child(mathml); // debug!("Not chemistry -- retry:\n{}", mml_to_string(mathml)); - let mut converted_mathml = self.canonicalize_mrows(mathml) + let mut converted_mathml = self.canonicalize_mrows(definitions, mathml) .chain_err(|| format!("while processing\n{}", mml_to_string(mathml)))?; if !crate::chemistry::scan_and_mark_chemistry(converted_mathml) { // debug!("canonicalize before canonicalize_mrows:\n{}", mml_to_string(converted_mathml)); self.assure_nary_tag_has_one_child(converted_mathml); - converted_mathml = self.canonicalize_mrows(mathml) + converted_mathml = self.canonicalize_mrows(definitions, mathml) .chain_err(|| format!("while processing\n{}", mml_to_string(mathml)))?; } debug!("\nMathML after canonicalize:\n{}", mml_to_string(converted_mathml)); @@ -739,7 
+755,7 @@ impl CanonicalizeContext { /// "arg trig" functions, pseudo scripts, and others /// /// Returns 'None' if the element should not be in the tree. - fn clean_mathml<'a>(&self, mathml: Element<'a>) -> Option> { + fn clean_mathml<'a>(&self, definitions: &Definitions, mathml: Element<'a>) -> Option> { // Note: this works bottom-up (clean the children first, then this element) lazy_static! { static ref IS_PRIME: Regex = Regex::new(r"['′″‴⁗]").unwrap(); @@ -858,7 +874,7 @@ impl CanonicalizeContext { as_element(following_siblings[0]).remove_from_parent(); } return Some(mathml); - } else if let Some(result) = merge_arc_trig(mathml) { + } else if let Some(result) = merge_arc_trig(definitions, mathml) { return Some(result); } else if IS_PRIME.is_match(text) { let new_text = merge_prime_text(text); @@ -867,9 +883,9 @@ impl CanonicalizeContext { } else if text == "..." { mathml.set_text("…"); return Some(mathml); - } else if let Some(result) = split_points(mathml) { + } else if let Some(result) = split_points(definitions, mathml) { return Some(result); - } else if let Some(result) = merge_mi_sequence(mathml) { + } else if let Some(result) = merge_mi_sequence(definitions, mathml) { return Some(result); } else { return Some(mathml); @@ -878,11 +894,11 @@ impl CanonicalizeContext { "mtext" => { // debug!("before merge_arc_trig: {}", mml_to_string(mathml)); - if let Some(result) = merge_arc_trig(mathml) { + if let Some(result) = merge_arc_trig(definitions, mathml) { return Some(result); }; - if let Some(result) = split_points(mathml) { + if let Some(result) = split_points(definitions, mathml) { return Some(result); } @@ -919,7 +935,7 @@ impl CanonicalizeContext { } else { match text { "arc" | "arc " | "arc " /* non-breaking space */ => { - if let Some(result) = merge_arc_trig(mathml) { + if let Some(result) = merge_arc_trig(definitions, mathml) { return Some(result); } }, @@ -942,27 +958,25 @@ impl CanonicalizeContext { // common bug: trig functions, lim, etc., should be 
mi // same for ellipsis ("…") - return crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { - if ["…", "⋯", "∞"].contains(&text) || - definitions.borrow().get_hashset("FunctionNames").unwrap().contains(text) || - definitions.borrow().get_hashset("GeometryShapes").unwrap().contains(text) { - set_mathml_name(mathml, "mi"); - return Some(mathml); - } - if IS_PRIME.is_match(text) { - let new_text = merge_prime_text(text); - mathml.set_text(&new_text); - return Some(mathml); - } - if CURRENCY_SYMBOLS.contains(text) { - set_mathml_name(mathml, "mi"); - return Some(mathml); - } + if ["…", "⋯", "∞"].contains(&text) || + definitions.get_hashset("FunctionNames").unwrap().contains(text) || + definitions.get_hashset("GeometryShapes").unwrap().contains(text) { + set_mathml_name(mathml, "mi"); + return Some(mathml); + } + if IS_PRIME.is_match(text) { + let new_text = merge_prime_text(text); + mathml.set_text(&new_text); return Some(mathml); - }); + } + if CURRENCY_SYMBOLS.contains(text) { + set_mathml_name(mathml, "mi"); + return Some(mathml); + } + return Some(mathml); // note: chemistry test is done later as part of another phase of chemistry cleanup }, - "mfenced" => {return self.clean_mathml( convert_mfenced_to_mrow(mathml) )}, + "mfenced" => {return self.clean_mathml(definitions, convert_mfenced_to_mrow(mathml))}, "mstyle" | "mpadded" => { // Throw out mstyle and mpadded -- to do this, we need to avoid mstyle being the arg of clean_mathml // FIX: should probably push the attrs down to the children (set in 'self') @@ -972,7 +986,7 @@ impl CanonicalizeContext { return if parent_requires_child {Some( CanonicalizeContext::make_empty_element(mathml) )} else {None}; } else if children.len() == 1 { let is_from_mhchem = element_name == "mpadded" && is_from_mhchem_hack(mathml); - if let Some(new_mathml) = self.clean_mathml( as_element(children[0]) ) { + if let Some(new_mathml) = self.clean_mathml(definitions, as_element(children[0])) { // "lift" the child up so all the links 
(e.g., siblings) are correct mathml.replace_children(new_mathml.children()); set_mathml_name(mathml, name(new_mathml)); @@ -992,7 +1006,7 @@ impl CanonicalizeContext { // wrap the children in an mrow, but maintain tree siblings by changing mpadded/mstyle to mrow set_mathml_name(mathml, "mrow"); mathml.set_attribute_value(CHANGED_ATTR, ADDED_ATTR_VALUE); - return self.clean_mathml(mathml); // now it's an mrow so a different path next time + return self.clean_mathml(definitions, mathml); // now it's an mrow so a different path next time } }, "mphantom" | "malignmark" | "maligngroup"=> { @@ -1016,7 +1030,7 @@ impl CanonicalizeContext { // The compromise is to move the annotations into an attr named data-annotation[-xml]- // The attribute is put on presentation element root let presentation = get_presentation_element(mathml).1; - let new_presentation = if let Some(presentation) = self.clean_mathml(presentation) { + let new_presentation = if let Some(presentation) = self.clean_mathml(definitions, presentation) { presentation } else { // probably shouldn't happen, but just in case @@ -1033,7 +1047,7 @@ impl CanonicalizeContext { return if parent_requires_child {Some(mathml)} else {None}; } else if children.len() == 1 && CanonicalizeContext::is_ok_to_merge_mrow_child(mathml) { let is_from_mhchem = is_from_mhchem_hack(mathml); - if let Some(new_mathml) = self.clean_mathml(as_element(children[0])) { + if let Some(new_mathml) = self.clean_mathml(definitions, as_element(children[0])) { // "lift" the child up so all the links (e.g., siblings) are correct mathml.replace_children(new_mathml.children()); set_mathml_name(mathml, name(new_mathml)); @@ -1067,7 +1081,7 @@ impl CanonicalizeContext { let mut i = 0; while i < children.len() { if let Some(child) = children[i].element() { - match self.clean_mathml(child) { + match self.clean_mathml(definitions, child) { None => { mathml.remove_child(child); // don't increment 'i' because there is one less child now and so everything 
shifted left @@ -1094,7 +1108,7 @@ impl CanonicalizeContext { // crate::canonicalize::assure_mathml(get_parent(start_of_change)).unwrap(); // FIX: find a recovery -- we're in deep trouble if this isn't true if start_of_change != child { // debug!("clean_mathml: start_of_change != mathml -- mathml={}", mml_to_string(mathml)); - return self.clean_mathml(mathml); // restart cleaning + return self.clean_mathml(definitions, mathml); // restart cleaning } } i += 1; @@ -1531,7 +1545,7 @@ impl CanonicalizeContext { } /// If arg is "arc" (with optional space), merge the following element in if a trig function (sibling is deleted) - fn merge_arc_trig(leaf: Element) -> Option { + fn merge_arc_trig<'a>(definitions: &Definitions, leaf: Element<'a>) -> Option> { assert!(is_leaf(leaf)); let leaf_text = as_text(leaf); if !(leaf_text == "arc" || leaf_text == "arc " || leaf_text == "arc " /* non-breaking space */ ) { @@ -1549,18 +1563,16 @@ impl CanonicalizeContext { return None; } - return crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { - // change "arc" "cos" to "arccos" -- we look forward because calling loop stores previous node - let following_text = as_text(following_sibling); - if definitions.borrow().get_hashset("TrigFunctionNames").unwrap().contains(following_text) { - let new_text = "arc".to_string() + following_text; - set_mathml_name(leaf, "mi"); - leaf.set_text(&new_text); - following_sibling.remove_from_parent(); - return Some(leaf); - } - return None; - }) + // change "arc" "cos" to "arccos" -- we look forward because calling loop stores previous node + let following_text = as_text(following_sibling); + if definitions.get_hashset("TrigFunctionNames").unwrap().contains(following_text) { + let new_text = "arc".to_string() + following_text; + set_mathml_name(leaf, "mi"); + leaf.set_text(&new_text); + following_sibling.remove_from_parent(); + return Some(leaf); + } + return None; } /// Convert "||" to "‖", if in single element or in repeated 'mo's (but not 
"|x||y|" or "{x ||x|>0}") @@ -1666,7 +1678,7 @@ impl CanonicalizeContext { return UPPER_ROMAN_NUMERAL.is_match(text) || LOWER_ROMAN_NUMERAL.is_match(text); } - /// Return true if 'element' (which is syntactically a roman numeral) is only inside mrows and + /// Return true if 'element' (which is syntactically a roman numeral) is only inside mrows and /// if its length is < 3 chars, then there is another roman numeral near it (separated by an operator). /// We want to rule out something like 'm' or 'cm' being a roman numeral. /// Note: this function assumes 'mathml' is a Roman Numeral, and optimizes operations based on that. @@ -1927,7 +1939,7 @@ impl CanonicalizeContext { /// If we have something like 'shape' ABC, we split the ABC and add IMPLIED_SEPARATOR_HIGH_PRIORITY between them /// under some specific conditions (trying to be a little cautious). /// The returned (mrow) element reuses the arg so tree siblings links remain correct. - fn split_points(leaf: Element) -> Option { + fn split_points<'a>(definitions: &Definitions, leaf: Element<'a>) -> Option> { lazy_static!{ static ref IS_UPPERCASE: Regex = Regex::new(r"^[A-Z]+$").unwrap(); } @@ -1958,17 +1970,14 @@ impl CanonicalizeContext { let preceding_sibling_name = name(preceding_sibling); if preceding_sibling_name == "mi" || preceding_sibling_name == "mo" || preceding_sibling_name == "mtext" { let preceding_text = as_text(preceding_sibling); - return crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { - let defs = definitions.borrow(); - let prefix_ops = defs.get_hashset("GeometryPrefixOperators").unwrap(); - let shapes = defs.get_hashset("GeometryShapes").unwrap(); - if prefix_ops.contains(preceding_text) || shapes.contains(preceding_text) { - // split leaf - return Some( split_element(leaf) ); // always treated as function names - } else { - return None; - } - }) + let prefix_ops = definitions.get_hashset("GeometryPrefixOperators").unwrap(); + let shapes =
definitions.get_hashset("GeometryShapes").unwrap(); + if prefix_ops.contains(preceding_text) || shapes.contains(preceding_text) { + // split leaf + return Some( split_element(leaf) ); // always treated as function names + } else { + return None; + } } } return None; @@ -1989,7 +1998,7 @@ impl CanonicalizeContext { /// If we have something like 'V e l o c i t y', merge that into a single /// We only do this for sequences of at least three chars, and also exclude things like consecutive letter (e.g., 'x y z') /// The returned (mi) element reuses 'mi' - fn merge_mi_sequence(mi: Element) -> Option { + fn merge_mi_sequence<'a>(definitions: &Definitions, mi: Element<'a>) -> Option> { // The best solution would be to use a dictionary of words, or maybe restricted to words in a formula, // but that would likely miss the words used in slope=run/rise // We shouldn't need to worry about trig names like "cos", but people sometimes forget to use "\cos" @@ -2053,20 +2062,14 @@ impl CanonicalizeContext { following_mi_siblings.push(last); } // debug!("merge_mi_sequence: text={}", &text); - if let Some(answer) = crate::definitions::SPEECH_DEFINITIONS.with(|definitions| { - let definitions = definitions.borrow(); - if definitions.get_hashset("FunctionNames").unwrap().contains(&text) { - return Some(merge_from_text(mi, &text, &following_mi_siblings)); - } - // unlike "FunctionNames", "KnownWords" might not exist - if let Some(word_map) = definitions.get_hashset("KnownWords") { - if word_map.contains(&text) { - return Some(merge_from_text(mi, &text, &following_mi_siblings)); - } + if definitions.get_hashset("FunctionNames").unwrap().contains(&text) { + return merge_from_text(mi, &text, &following_mi_siblings); + } + // unlike "FunctionNames", "KnownWords" might not exist + if let Some(word_map) = definitions.get_hashset("KnownWords") { + if word_map.contains(&text) { + return merge_from_text(mi, &text, &following_mi_siblings); } - return None; - }) { - return answer; } // don't be too 
aggressive combining mi's when they are short @@ -2797,7 +2800,7 @@ impl CanonicalizeContext { } } - fn canonicalize_mrows<'a>(&self, mathml: Element<'a>) -> Result> { + fn canonicalize_mrows<'a>(&self, definitions: &Definitions, mathml: Element<'a>) -> Result> { let tag_name = name(mathml); set_mathml_name(mathml, tag_name); // add namespace match tag_name { @@ -2814,7 +2817,7 @@ impl CanonicalizeContext { return Ok( mathml ); }, "mrow" => { - return self.canonicalize_mrows_in_mrow(mathml); + return self.canonicalize_mrows_in_mrow(definitions, mathml); }, _ => { // recursively try to make mrows in other structures (eg, num/denom in fraction) @@ -2822,7 +2825,7 @@ impl CanonicalizeContext { for child in mathml.children() { match child { ChildOfElement::Element(e) => { - new_children.push( ChildOfElement::Element(self.canonicalize_mrows(e)? )); + new_children.push( ChildOfElement::Element(self.canonicalize_mrows(definitions, e)? )); }, ChildOfElement::Text(t) => { if mathml.children().len() != 1 { @@ -3172,7 +3175,7 @@ impl CanonicalizeContext { // e.g., n!!n -- ((n!)!)*n or (n!)*(!n) -- the latter doesn't make semantic sense though // FIX: the above ignores mspace and other nodes that need to be skipped to determine the right node to determine airity // FIX: the postfix problem above should be addressed - fn find_operator<'a>(context: Option<&CanonicalizeContext>, mo_node: Element<'a>, previous_operator: Option<&'static OperatorInfo>, + fn find_operator<'a>(context_and_definitions: Option<(&CanonicalizeContext, &Definitions)>, mo_node: Element<'a>, previous_operator: Option<&'static OperatorInfo>, previous_node: Option>, next_node: Option>) -> &'static OperatorInfo { // get the unicode value and return the OpKeyword associated with it assert!( name(mo_node) == "mo"); @@ -3180,9 +3183,9 @@ impl CanonicalizeContext { // if a form has been given, that takes precedence let form = mo_node.attribute_value("form"); let op_type = match form { - None => match context { + 
None => match context_and_definitions { None => OperatorTypes::POSTFIX, // what compute_type_from_position returns when the other args to this are all None - Some(context) => compute_type_from_position(context, previous_operator, previous_node, next_node), + Some((context, definitions)) => compute_type_from_position(context, definitions, previous_operator, previous_node, next_node), }, Some(form) => match form.to_lowercase().as_str() { "prefix" => OperatorTypes::PREFIX, @@ -3210,7 +3213,7 @@ impl CanonicalizeContext { } - fn compute_type_from_position<'a>(context: &CanonicalizeContext, previous_operator: Option<&'static OperatorInfo>, previous_node: Option>, next_node: Option>) -> OperatorTypes { + fn compute_type_from_position<'a>(context: &CanonicalizeContext, definitions: &Definitions, previous_operator: Option<&'static OperatorInfo>, previous_node: Option>, next_node: Option>) -> OperatorTypes { // based on choices, pick one that fits the context // if there isn't an obvious one, we have parsed the left, but not the right, so discount that @@ -3219,11 +3222,11 @@ impl CanonicalizeContext { // Need to be careful because (sin - cos)(x) needs an infix '-' // Return either the prefix or infix version of the operator if next_node.is_some() && - context.is_function_name(get_possible_embellished_node(next_node.unwrap()), None) == FunctionNameCertainty::True { + context.is_function_name(definitions, get_possible_embellished_node(next_node.unwrap()), None) == FunctionNameCertainty::True { return OperatorTypes::INFIX; } if previous_node.is_some() && - context.is_function_name(get_possible_embellished_node(previous_node.unwrap()), None) == FunctionNameCertainty::True { + context.is_function_name(definitions, get_possible_embellished_node(previous_node.unwrap()), None) == FunctionNameCertainty::True { return OperatorTypes::PREFIX; } @@ -3286,7 +3289,10 @@ impl CanonicalizeContext { } - fn determine_vertical_bar_op<'a>(&self, original_op: &'static OperatorInfo, mo_node: 
Element<'a>, + fn determine_vertical_bar_op<'a>(&self, + definitions: &Definitions, + original_op: &'static OperatorInfo, + mo_node: Element<'a>, next_child: Option>, parse_stack: &'a mut Vec, n_vertical_bars_on_right: usize) -> &'static OperatorInfo { @@ -3356,7 +3362,7 @@ impl CanonicalizeContext { } else { let next_next_children = next_child.following_siblings(); let next_next_child = if next_next_children.is_empty() { None } else { Some( as_element(next_next_children[0]) )}; - Some( CanonicalizeContext::find_operator(Some(self), next_child, operator_versions.infix, + Some( CanonicalizeContext::find_operator(Some((self, definitions)), next_child, operator_versions.infix, top(parse_stack).last_child_in_mrow(), next_next_child) ) }; @@ -3417,7 +3423,7 @@ impl CanonicalizeContext { // 2. If there are no parens, then only names on the known function list are used (e.g., "sin x") // // If the name if followed by parens but doesn't fit into the above categories, we return a "maybe" - fn is_function_name<'a>(&self, node: Element<'a>, right_siblings: Option<&[ChildOfElement<'a>]>) -> FunctionNameCertainty { + fn is_function_name<'a>(&self, definitions: &Definitions, node: Element<'a>, right_siblings: Option<&[ChildOfElement<'a>]>) -> FunctionNameCertainty { let base_of_name = get_possible_embellished_node(node); // actually only 'mi' should be legal here, but some systems used 'mtext' for multi-char variables @@ -3432,104 +3438,102 @@ impl CanonicalizeContext { return FunctionNameCertainty::False; } // debug!(" is_function_name({}), {} following nodes", base_name, if right_siblings.is_none() {"No".to_string()} else {right_siblings.unwrap().len().to_string()}); - return crate::definitions::SPEECH_DEFINITIONS.with(|defs| { - // names that are always function names (e.g, "sin" and "log") - let defs = defs.borrow(); - let names = defs.get_hashset("FunctionNames").unwrap(); - // UEB seems to think "Sin" (etc) is used for "sin", so we move to lower case - if 
names.contains(&base_name.to_ascii_lowercase()) { - // debug!(" ...is in FunctionNames"); - return FunctionNameCertainty::True; // always treated as function names - } - - // We include shapes as function names so that △ABC makes sense since △ and - // the other shapes are not in the operator dictionary - let shapes = defs.get_hashset("GeometryShapes").unwrap(); - if shapes.contains(base_name) { - return FunctionNameCertainty::True; // always treated as function names - } - if right_siblings.is_none() { - return FunctionNameCertainty::False; // only accept known names, which is tested above - } + // names that are always function names (e.g, "sin" and "log") + let names = definitions.get_hashset("FunctionNames").unwrap(); + // UEB seems to think "Sin" (etc) is used for "sin", so we move to lower case + if names.contains(&base_name.to_ascii_lowercase()) { + // debug!(" ...is in FunctionNames"); + return FunctionNameCertainty::True; // always treated as function names + } - // make sure that what follows starts and ends with parens/brackets - assert_eq!(name(get_parent(node)), "mrow"); - let right_siblings = right_siblings.unwrap(); - let non_whitespace = right_siblings.iter().enumerate() - .find(|&(_, child)| { - let child = as_element(*child); - name(child) != "mtext" || !as_text(child).trim().is_empty() - }); - let right_siblings = if let Some( (i, _) ) = non_whitespace {&right_siblings[i..]} else {right_siblings}; - if right_siblings.is_empty() { - // debug!(" ...right siblings not None, but zero of them"); - return FunctionNameCertainty::False; - } + // We include shapes as function names so that △ABC makes sense since △ and + // the other shapes are not in the operator dictionary + let shapes = definitions.get_hashset("GeometryShapes").unwrap(); + if shapes.contains(base_name) { + return FunctionNameCertainty::True; // always treated as function names + } - let first_child = as_element(right_siblings[0]); - - // clean_chemistry wrapped up a state in an mrow and 
this is assumed by is_likely_chemical_state() - let chem_state_certainty = self.is_likely_chemical_state(node, first_child); - if chem_state_certainty != FunctionNameCertainty::True { - // debug!(" ...is_likely_chemical_state says it is a function ={:?}", chem_state_certainty); - return chem_state_certainty; - } + if right_siblings.is_none() { + return FunctionNameCertainty::False; // only accept known names, which is tested above + } - if name(first_child) == "mrow" && is_left_paren(as_element(first_child.children()[0])) { - // debug!(" ...trying again after expanding mrow"); - return self.is_function_name(node, Some(&first_child.children())); - } + // make sure that what follows starts and ends with parens/brackets + assert_eq!(name(get_parent(node)), "mrow"); + let right_siblings = right_siblings.unwrap(); + let non_whitespace = right_siblings.iter().enumerate() + .find(|&(_, child)| { + let child = as_element(*child); + name(child) != "mtext" || !as_text(child).trim().is_empty() + }); + let right_siblings = if let Some( (i, _) ) = non_whitespace {&right_siblings[i..]} else {right_siblings}; + if right_siblings.is_empty() { + // debug!(" ...right siblings not None, but zero of them"); + return FunctionNameCertainty::False; + } - if right_siblings.len() < 2 { - // debug!(" ...not enough right siblings"); - return FunctionNameCertainty::False; // can't be (...) 
- } + let first_child = as_element(right_siblings[0]); + + // clean_chemistry wrapped up a state in an mrow and this is assumed by is_likely_chemical_state() + let chem_state_certainty = self.is_likely_chemical_state(node, first_child); + if chem_state_certainty != FunctionNameCertainty::True { + // debug!(" ...is_likely_chemical_state says it is a function ={:?}", chem_state_certainty); + return chem_state_certainty; + } - // at least two siblings are this point -- check that they are parens/brackets - // we can only check the open paren/bracket because the right side is unparsed and we don't know the close location - let first_sibling = as_element(right_siblings[0]); - if name(first_sibling) != "mo" || !is_left_paren(first_sibling) // '(' or '[' - { - // debug!(" ...first sibling is not '(' or '['"); - return FunctionNameCertainty::False; - } - - let likely_names = defs.get_hashset("LikelyFunctionNames").unwrap(); - if likely_names.contains(base_name) { - return FunctionNameCertainty::True; // don't bother checking contents of parens, consider these as function names - } - - if is_single_arg(as_text(first_sibling), &right_siblings[1..]) { - // debug!(" ...is single arg"); - return FunctionNameCertainty::True; // if there is only a single arg, why else would you use parens? - }; + if name(first_child) == "mrow" && is_left_paren(as_element(first_child.children()[0])) { + // debug!(" ...trying again after expanding mrow"); + return self.is_function_name(definitions, node, Some(&first_child.children())); + } - if is_comma_arg(as_text(first_sibling), &right_siblings[1..]) { - // debug!(" ...is comma arg"); - return FunctionNameCertainty::True; // if there is only a single arg, why else would you use parens? 
- }; - - // FIX: should really make sure all the args are marked as MAYBE_CHEMISTRY, but we don't know the matching close paren/bracket - if node.attribute(MAYBE_CHEMISTRY).is_some() && - as_element(right_siblings[1]).attribute(MAYBE_CHEMISTRY).is_some() { - return FunctionNameCertainty::False; - } - - // Names like "Tr" are likely function names, single letter names like "M" or "J" are iffy - // This needs to be after the chemical state check above to rule out Cl(g), etc - // This would be better if if were part of 'likely_names' as "[A-Za-z]+", but reg exprs don't work in HashSets. - // FIX: create our own struct and write appropriate traits for it and then it could work - let mut chars = base_name.chars(); - let first_char = chars.next().unwrap(); // we know there is at least one byte in it, hence one char - if chars.next().is_some() && first_char.is_uppercase() { - // debug!(" ...is uppercase name"); - return FunctionNameCertainty::True; - } - - // debug!(" ...didn't match options to be a function"); - return FunctionNameCertainty::Maybe; // didn't fit one of the above categories - }); + if right_siblings.len() < 2 { + // debug!(" ...not enough right siblings"); + return FunctionNameCertainty::False; // can't be (...) 
+ } + + // at least two siblings are this point -- check that they are parens/brackets + // we can only check the open paren/bracket because the right side is unparsed and we don't know the close location + let first_sibling = as_element(right_siblings[0]); + if name(first_sibling) != "mo" || !is_left_paren(first_sibling) // '(' or '[' + { + // debug!(" ...first sibling is not '(' or '['"); + return FunctionNameCertainty::False; + } + + let likely_names = definitions.get_hashset("LikelyFunctionNames").unwrap(); + if likely_names.contains(base_name) { + return FunctionNameCertainty::True; // don't bother checking contents of parens, consider these as function names + } + + if is_single_arg(as_text(first_sibling), &right_siblings[1..]) { + // debug!(" ...is single arg"); + return FunctionNameCertainty::True; // if there is only a single arg, why else would you use parens? + }; + + if is_comma_arg(as_text(first_sibling), &right_siblings[1..]) { + // debug!(" ...is comma arg"); + return FunctionNameCertainty::True; // if there is only a single arg, why else would you use parens? + }; + + // FIX: should really make sure all the args are marked as MAYBE_CHEMISTRY, but we don't know the matching close paren/bracket + if node.attribute(MAYBE_CHEMISTRY).is_some() && + as_element(right_siblings[1]).attribute(MAYBE_CHEMISTRY).is_some() { + return FunctionNameCertainty::False; + } + + // Names like "Tr" are likely function names, single letter names like "M" or "J" are iffy + // This needs to be after the chemical state check above to rule out Cl(g), etc + // This would be better if if were part of 'likely_names' as "[A-Za-z]+", but reg exprs don't work in HashSets. 
+ // FIX: create our own struct and write appropriate traits for it and then it could work + let mut chars = base_name.chars(); + let first_char = chars.next().unwrap(); // we know there is at least one byte in it, hence one char + if chars.next().is_some() && first_char.is_uppercase() { + // debug!(" ...is uppercase name"); + return FunctionNameCertainty::True; + } + + // debug!(" ...didn't match options to be a function"); + return FunctionNameCertainty::Maybe; // didn't fit one of the above categories fn is_single_arg(open: &str, following_nodes: &[ChildOfElement]) -> bool { // following_nodes are nodes after "(" @@ -3594,7 +3598,7 @@ impl CanonicalizeContext { } } - fn is_mixed_fraction<'a>(&self, integer_part: Element<'a>, fraction_children: &[ChildOfElement<'a>]) -> Result { + fn is_mixed_fraction<'a>(&self, definitions: &Definitions, integer_part: Element<'a>, fraction_children: &[ChildOfElement<'a>]) -> Result { // do some simple disqualifying checks on the fraction part if fraction_children.is_empty() { return Ok( false ); @@ -3615,7 +3619,7 @@ impl CanonicalizeContext { return Ok( is_mfrac_ok(right_child) ); } - return is_linear_fraction(self, fraction_children); + return is_linear_fraction(self, definitions, fraction_children); fn is_int(integer_part: Element) -> bool { @@ -3653,7 +3657,7 @@ impl CanonicalizeContext { return is_int(denominator); } - fn is_linear_fraction(canonicalize: &CanonicalizeContext, fraction_children: &[ChildOfElement]) -> Result { + fn is_linear_fraction(canonicalize: &CanonicalizeContext, definitions: &Definitions, fraction_children: &[ChildOfElement]) -> Result { // two possibilities // 1. '3 / 4' is in an mrow // 2. 
'3 / 4' are three separate elements @@ -3662,7 +3666,7 @@ impl CanonicalizeContext { if first_child.children().len() != 3 { return Ok( false ); } - return is_linear_fraction(canonicalize, &first_child.children()) + return is_linear_fraction(canonicalize, definitions, &first_child.children()) } @@ -3672,9 +3676,9 @@ impl CanonicalizeContext { if !is_int(first_child) { return Ok( false ); } - let slash_part = canonicalize.canonicalize_mrows(as_element(fraction_children[1]))?; + let slash_part = canonicalize.canonicalize_mrows(definitions, as_element(fraction_children[1]))?; if name(slash_part) == "mo" && as_text(slash_part) == "/" { - let denom = canonicalize.canonicalize_mrows(as_element(fraction_children[2]))?; + let denom = canonicalize.canonicalize_mrows(definitions, as_element(fraction_children[2]))?; return Ok( is_int(denom) ); } return Ok( false ); @@ -3758,7 +3762,8 @@ impl CanonicalizeContext { // Add the current operator if it's not n-ary to the stack // 'current_child' and it the operator to the stack. 
fn shift_stack<'s, 'a:'s, 'op:'a>( - &self, parse_stack: &'s mut Vec>, + &self, definitions: &Definitions, + parse_stack: &'s mut Vec>, current_child: Element<'a>, current_op: OperatorPair<'op>) -> (Element<'a>, OperatorPair<'op>) { let mut new_current_child = current_child; @@ -3790,7 +3795,7 @@ impl CanonicalizeContext { let children = mrow.children(); // debug!("looking for left fence: len={}, {:#?}", children.len(), self.find_operator(as_element(children[0]),None, None, Some(as_element(children[1])) )); if children.len() == 2 && (name(as_element(children[0])) != "mo" || - !CanonicalizeContext::find_operator(Some(self), as_element(children[0]), + !CanonicalizeContext::find_operator(Some((self, definitions)), as_element(children[0]), None, Some(as_element(children[0])), Some(mrow) ).is_left_fence()) { // the mrow did *not* start with an open (hence no push) // since parser really wants balanced parens to keep stack state right, we do a push here @@ -3852,7 +3857,7 @@ impl CanonicalizeContext { return prev_priority; } - fn is_trig_arg<'a, 'op:'a>(&self, previous_child: Element<'a>, current_child: Element<'a>, parse_stack: &mut Vec>) -> bool { + fn is_trig_arg<'a, 'op:'a>(&self, definitions: &Definitions, previous_child: Element<'a>, current_child: Element<'a>, parse_stack: &mut Vec>) -> bool { // We have operand-operand and know we want multiplication at this point. // Check for special case where we want multiplication to bind more tightly than function app (e.g, sin 2x, sin -2xy) // We only want to do this for simple args @@ -3869,7 +3874,7 @@ impl CanonicalizeContext { // Use lower priority multiplication if current_child is a function (e.g. 
"cos" in "sin x cos 3y") // if !is_trig(current_child) { - if self.is_function_name(current_child, None) == FunctionNameCertainty::True { + if self.is_function_name(definitions, current_child, None) == FunctionNameCertainty::True { return false; } // Three cases: @@ -3880,7 +3885,7 @@ impl CanonicalizeContext { let op_on_top = &top(parse_stack).op_pair; if ptr_eq(op_on_top.op, *INVISIBLE_FUNCTION_APPLICATION) { let function_element = as_element(top(parse_stack).mrow.children()[0]); - return is_trig(function_element); + return is_trig(definitions, function_element); } if ptr_eq(op_on_top.op, *PREFIX_MINUS) { if parse_stack.len() < 2 { @@ -3891,7 +3896,7 @@ impl CanonicalizeContext { return false; } let function_element = as_element(next_stack_info.mrow.children()[0]); - if is_trig(function_element) { + if is_trig(definitions, function_element) { // want '- 2' to be an mrow; don't want '- 2 x ...' to be the mrow (IMPLIED_TIMES_HIGH_PRIORITY is an internal hack) self.reduce_stack_one_time(parse_stack); return true; @@ -3900,7 +3905,7 @@ impl CanonicalizeContext { } return ptr_eq(op_on_top.op, &*IMPLIED_TIMES_HIGH_PRIORITY); - fn is_trig(node: Element) -> bool { + fn is_trig(definitions: &Definitions, node: Element) -> bool { let base_of_name = get_possible_embellished_node(node); // actually only 'mi' should be legal here, but some systems used 'mtext' for multi-char variables @@ -3913,13 +3918,10 @@ impl CanonicalizeContext { if base_name.is_empty() { return false; } - return crate::definitions::SPEECH_DEFINITIONS.with(|defs| { - // names that are always function names (e.g, "sin" and "log") - let defs = defs.borrow(); - let names = defs.get_hashset("TrigFunctionNames").unwrap(); - // UEB seems to think "Sin" (etc) is used for "sin", so we move to lower case - return names.contains(&base_name.to_ascii_lowercase()); - }); + // names that are always function names (e.g, "sin" and "log") + let names = definitions.get_hashset("TrigFunctionNames").unwrap(); + // UEB seems 
to think "Sin" (etc) is used for "sin", so we move to lower case + return names.contains(&base_name.to_ascii_lowercase()); } } @@ -3945,7 +3947,7 @@ impl CanonicalizeContext { +/- are treated as nary operators and don't push/pop in those cases. consecutive operands such as nary times are also considered n-ary operators and don't push/pop in those cases. */ - fn canonicalize_mrows_in_mrow<'a>(&self, mrow: Element<'a>) -> Result> { + fn canonicalize_mrows_in_mrow<'a>(&self, definitions: &Definitions, mrow: Element<'a>) -> Result> { let is_ok_to_merge_child = mrow.children().len() != 1 || CanonicalizeContext::is_ok_to_merge_mrow_child(mrow); let saved_mrow_attrs = mrow.attributes(); assert_eq!(name(mrow), "mrow"); @@ -3959,7 +3961,7 @@ impl CanonicalizeContext { for i_child in 0..num_children { // debug!("\nDealing with child #{}: {}", i_child, mml_to_string(as_element(children[i_child]))); - let mut current_child = self.canonicalize_mrows(as_element(children[i_child]))?; + let mut current_child = self.canonicalize_mrows(definitions, as_element(children[i_child]))?; children[i_child] = ChildOfElement::Element( current_child ); let base_of_child = get_possible_embellished_node(current_child); let acts_as_ch = current_child.attribute_value(ACT_AS_OPERATOR); @@ -3976,19 +3978,20 @@ impl CanonicalizeContext { temp_mo.set_text(acts_as_ch); current_op = OperatorPair{ ch: acts_as_ch, - op: CanonicalizeContext::find_operator(Some(self), temp_mo, previous_op, + op: CanonicalizeContext::find_operator(Some((self, definitions)), temp_mo, previous_op, top(&parse_stack).last_child_in_mrow(), next_node) }; } else { current_op = OperatorPair{ ch: as_text(base_of_child), - op: CanonicalizeContext::find_operator(Some(self), base_of_child, previous_op, + op: CanonicalizeContext::find_operator(Some((self, definitions)), base_of_child, previous_op, top(&parse_stack).last_child_in_mrow(), next_node) }; // deal with vertical bars which might be infix, open, or close fences // note: mrow 
shrinks as we iterate through it (removing children from it) current_op.op = self.determine_vertical_bar_op( + definitions, current_op.op, base_of_child, next_node, @@ -4002,7 +4005,7 @@ impl CanonicalizeContext { let base_of_previous_child = get_possible_embellished_node(previous_child); let acts_as_ch = previous_child.attribute_value(ACT_AS_OPERATOR); if name(base_of_previous_child) != "mo" && acts_as_ch.is_none() { - let likely_function_name = self.is_function_name(previous_child, Some(&children[i_child..])); + let likely_function_name = self.is_function_name(definitions, previous_child, Some(&children[i_child..])); if name(base_of_child) == "mtext" && as_text(base_of_child) == "\u{00A0}" { base_of_child.set_attribute_value("data-function-likelihood", &(likely_function_name == FunctionNameCertainty::True).to_string()); base_of_child.remove_attribute("data-was-mo"); @@ -4015,7 +4018,7 @@ impl CanonicalizeContext { // consecutive operands -- add an invisible operator as appropriate current_op = if likely_function_name == FunctionNameCertainty::True { OperatorPair{ ch: "\u{2061}", op: &INVISIBLE_FUNCTION_APPLICATION } - } else if self.is_mixed_fraction(previous_child, &children[i_child..])? { + } else if self.is_mixed_fraction(definitions, previous_child, &children[i_child..])? 
{ OperatorPair{ ch: "\u{2064}", op: &IMPLIED_INVISIBLE_PLUS } } else if self.is_implied_comma(previous_child, current_child, mrow) { OperatorPair{ch: "\u{2063}", op: &IMPLIED_INVISIBLE_COMMA } @@ -4023,7 +4026,7 @@ impl CanonicalizeContext { OperatorPair{ch: "\u{2063}", op: &IMPLIED_CHEMICAL_BOND } } else if self.is_implied_separator(previous_child, current_child) { OperatorPair{ch: "\u{2063}", op: &IMPLIED_SEPARATOR_HIGH_PRIORITY } - } else if self.is_trig_arg(base_of_previous_child, base_of_child, &mut parse_stack) { + } else if self.is_trig_arg(definitions, base_of_previous_child, base_of_child, &mut parse_stack) { OperatorPair{ch: "\u{2062}", op: &IMPLIED_TIMES_HIGH_PRIORITY } } else { OperatorPair{ ch: "\u{2062}", op: &IMPLIED_TIMES } @@ -4045,7 +4048,7 @@ impl CanonicalizeContext { } // debug!(" Found implicit op {}/{} [{:?}]", show_invisible_op_char(current_op.ch), current_op.op.priority, likely_function_name); self.reduce_stack(&mut parse_stack, current_op.op.priority); - let shift_result = self.shift_stack(&mut parse_stack, implied_mo, current_op.clone()); + let shift_result = self.shift_stack(definitions, &mut parse_stack, implied_mo, current_op.clone()); // ignore shift_result.0 which is just 'implied_mo' assert_eq!(implied_mo, shift_result.0); assert!( ptr_eq(current_op.op, shift_result.1.op) ); @@ -4063,7 +4066,7 @@ impl CanonicalizeContext { if top(&parse_stack).is_operand { // will end up with operand operand -- need to choose operator associated with prev child // we use the original input here because in this case, we need to look to the right of the ()s to deal with chemical states - let likely_function_name = self.is_function_name(as_element(children[i_child-1]), Some(&children[i_child..])); + let likely_function_name = self.is_function_name(definitions, as_element(children[i_child-1]), Some(&children[i_child..])); let implied_operator = if likely_function_name== FunctionNameCertainty::True { OperatorPair{ ch: "\u{2061}", op: 
&INVISIBLE_FUNCTION_APPLICATION } } else { @@ -4075,7 +4078,8 @@ impl CanonicalizeContext { if likely_function_name == FunctionNameCertainty::Maybe { implied_mo.set_attribute_value("data-function-guess", "true"); } - self.reduce_stack(&mut parse_stack, implied_operator.op.priority); let shift_result = self.shift_stack(&mut parse_stack, implied_mo, implied_operator.clone()); + self.reduce_stack(&mut parse_stack, implied_operator.op.priority); + let shift_result = self.shift_stack(definitions, &mut parse_stack, implied_mo, implied_operator.clone()); // ignore shift_result.0 which is just 'implied_mo' assert_eq!(implied_mo, shift_result.0); assert!( ptr_eq(implied_operator.op, shift_result.1.op) ); @@ -4095,7 +4099,7 @@ impl CanonicalizeContext { } self.reduce_stack(&mut parse_stack, current_op.op.priority); // push new operator on stack (already handled n-ary case) - let shift_result = self.shift_stack(&mut parse_stack, current_child, current_op); + let shift_result = self.shift_stack(definitions, &mut parse_stack, current_child, current_op); current_child = shift_result.0; current_op = shift_result.1; } @@ -4232,6 +4236,8 @@ fn show_invisible_op_char(ch: &str) -> &str { #[cfg(test)] mod canonicalize_tests { use crate::are_strs_canonically_equal_with_locale; + use crate::definitions::SPEECH_DEFINITIONS; + #[allow(unused_imports)] use super::super::init_logger; @@ -4392,12 +4398,15 @@ mod canonicalize_tests { #[test] fn illegal_mathml_element() { - use crate::interface::*; + use crate::canonicalize::CanonicalizeContext; + use crate::element_util::{get_element, trim_element}; let test_str = "f"; let package1 = &parser::parse(test_str).expect("Failed to parse test input"); let mathml = get_element(package1); trim_element(mathml, false); - assert!(canonicalize(mathml).is_err()); + + let canonicalize_context = CanonicalizeContext::new_from_global_prefs_cached(); + assert!(SPEECH_DEFINITIONS.with_borrow(|definitions| canonicalize_context.canonicalize(definitions, 
mathml).is_err())); } @@ -4495,9 +4504,9 @@ mod canonicalize_tests { #[test] fn mrow_with_intent_and_single_child() { - use crate::interface::*; + use crate::element_util::{get_element, trim_element}; use sxd_document::parser; - use crate::canonicalize::canonicalize; + use crate::canonicalize::CanonicalizeContext; // this forces initialization crate::interface::set_rules_dir(abs_rules_dir_path()).unwrap(); crate::speech::SPEECH_RULES.with(|_| true); @@ -4508,7 +4517,10 @@ mod canonicalize_tests { let package1 = &parser::parse(test).expect("Failed to parse test input"); let mathml = get_element(package1); trim_element(mathml, false); - let mathml_test = canonicalize(mathml).unwrap(); + + let canonicalize_context = CanonicalizeContext::new_from_global_prefs_cached(); + let mathml_test = SPEECH_DEFINITIONS.with_borrow(|definitions| canonicalize_context.canonicalize(definitions, mathml)).unwrap(); + let first_child = as_element( mathml_test.children()[0] ); assert_eq!(name(first_child), "mrow"); assert_eq!(first_child.children().len(), 1); @@ -4519,9 +4531,9 @@ mod canonicalize_tests { #[test] fn empty_mrow_with_intent() { // we don't want to remove the mrow because the intent on the mi would reference itself - use crate::interface::*; + use crate::element_util::{get_element, trim_element}; use sxd_document::parser; - use crate::canonicalize::canonicalize; + use crate::canonicalize::CanonicalizeContext; // this forces initialization crate::interface::set_rules_dir(abs_rules_dir_path()).unwrap(); crate::speech::SPEECH_RULES.with(|_| true); @@ -4532,7 +4544,10 @@ mod canonicalize_tests { let package1 = &parser::parse(test).expect("Failed to parse test input"); let mathml = get_element(package1); trim_element(mathml, false); - let mathml_test = canonicalize(mathml).unwrap(); + + let canonicalize_context = CanonicalizeContext::new_from_global_prefs_cached(); + let mathml_test = SPEECH_DEFINITIONS.with_borrow(|definitions| canonicalize_context.canonicalize(definitions, 
mathml)).unwrap(); + let first_child = as_element( mathml_test.children()[0] ); assert_eq!(name(first_child), "mrow"); assert_eq!(first_child.children().len(), 1); @@ -5318,7 +5333,7 @@ mod canonicalize_tests {   -   +   "; assert!(are_strs_canonically_equal(test_str, target_str)); @@ -6204,4 +6219,3 @@ mod canonicalize_tests { assert!(are_strs_canonically_equal(test_str, target_str)); } } - diff --git a/src/chemistry.rs b/src/chemistry.rs index 503f5013..b582b405 100644 --- a/src/chemistry.rs +++ b/src/chemistry.rs @@ -1818,7 +1818,7 @@ mod chem_tests { fn parse_mathml_string(test: &str, test_mathml: F) -> bool where F: Fn(Element) -> bool { use sxd_document::parser; - use crate::interface::{get_element, trim_element}; + use crate::element_util::{get_element, trim_element}; let new_package = parser::parse(&test); if let Err(e) = new_package { diff --git a/src/definitions.rs b/src/definitions.rs index 5d1e7725..ca063045 100644 --- a/src/definitions.rs +++ b/src/definitions.rs @@ -28,8 +28,7 @@ extern crate yaml_rust; use yaml_rust::yaml::Hash; use yaml_rust::Yaml; use crate::errors::*; -use crate::prefs::*; -use std::{cell::RefCell, cell::Ref, cell::RefMut, collections::HashSet, rc::Rc}; +use std::{cell::RefCell, cell::Ref, collections::HashSet, rc::Rc}; use std::{collections::HashMap, path::Path, path::PathBuf}; use crate::shim_filesystem::read_to_string_shim; @@ -128,33 +127,24 @@ thread_local!{ /// See [`Definitions`] for more details. pub static SPEECH_DEFINITIONS: RefCell = RefCell::new( Definitions::new() ); pub static BRAILLE_DEFINITIONS: RefCell = RefCell::new( Definitions::new() ); - pub static DEFINITIONS: &'static std::thread::LocalKey> = const { &SPEECH_DEFINITIONS }; } /// Reads the `definitions.yaml` files specified by current_files -- these are presumed to need updating. 
/// /// If there is a failure during read, the error is propagated to the caller -pub fn read_definitions_file(use_speech_defs: bool) -> Result> { +pub fn read_definitions_file(file_path: &Path, definitions: &mut Definitions) -> Result> { // for each file in `locations`, read the contents and process them - let pref_manager = PreferenceManager::get(); - let pref_manager = pref_manager.borrow(); - let file_path = pref_manager.get_definitions_file(use_speech_defs); - let definitions = if use_speech_defs {&SPEECH_DEFINITIONS} else {&BRAILLE_DEFINITIONS}; - definitions.with( |defs| defs.borrow_mut().name_to_var_mapping.clear() ); + definitions.name_to_var_mapping.clear(); let mut new_files = vec![file_path.to_path_buf()]; - let mut files_read = read_one_definitions_file(use_speech_defs, file_path).chain_err(|| format!("in file '{}", file_path.to_string_lossy()))?; + let mut files_read = read_one_definitions_file(file_path, definitions).chain_err(|| format!("in file '{}", file_path.to_string_lossy()))?; new_files.append(&mut files_read); // merge the contents of `TrigFunctions` into a set that contains all the function names (from `AdditionalFunctionNames`). 
- return definitions.with(|defs| { - let mut defs = defs.borrow_mut(); - make_all_set_references_valid(&mut defs); - return Ok(new_files); - }); - + make_all_set_references_valid(definitions); + return Ok(new_files); /// Make references to all used set be valid by creating empty sets if they weren't defined - fn make_all_set_references_valid(defs: &mut RefMut) { + fn make_all_set_references_valid(defs: &mut Definitions) { // FIX: this list is created by hand -- it would be better if there was a way to create the list Automatically // Note: "FunctionNames" is created in build_all_functions_set() if not already set let used_set_names = ["GeometryPrefixOperators", "LikelyFunctionNames", "TrigFunctionNames", "AdditionalFunctionNames", "Arrows", "GeometryShapes"]; @@ -171,7 +161,7 @@ pub fn read_definitions_file(use_speech_defs: bool) -> Result> { } /// merge "TrigFunctions" and "AdditionalFunctionNames" into a new set named "FunctionNames" - fn build_all_functions_set(defs: &mut RefMut) -> HashSet { + fn build_all_functions_set(defs: &Definitions) -> HashSet { let trig_functions = defs.get_hashset("TrigFunctionNames").unwrap(); let mut all_functions = defs.get_hashset("AdditionalFunctionNames").unwrap().clone(); for trig_name in trig_functions.iter() { @@ -182,7 +172,7 @@ pub fn read_definitions_file(use_speech_defs: bool) -> Result> { } use crate::speech::*; -fn read_one_definitions_file(use_speech_defs: bool, path: &Path) -> Result> { +fn read_one_definitions_file(path: &Path, definitions: &mut Definitions) -> Result> { // read in the file contents let definition_file_contents = read_to_string_shim(path) .chain_err(|| format!("trying to read {}", path.to_str().unwrap()))?; @@ -195,7 +185,7 @@ fn read_one_definitions_file(use_speech_defs: bool, path: &Path) -> Result Result Result>> { +fn build_values(definition: &Yaml, path: &Path, definitions: &mut Definitions) -> Result>> { // Rule::Definition let dictionary = crate::speech::as_hash_checked(definition)?; if 
dictionary.len()!=1 { @@ -222,7 +212,7 @@ fn build_values(definition: &Yaml, use_speech_defs: bool, path: &Path) -> Result let def_name = key.as_str().ok_or_else(|| format!("definition list name '{}' is not a string", yaml_to_type(key)))?; if def_name == "include" { let do_include_fn = |new_file: &Path| { - read_one_definitions_file(use_speech_defs, new_file) + read_one_definitions_file(new_file, definitions) }; let include_file_name = value.as_str().ok_or_else(|| format!("definition list include name '{}' is not a string", yaml_to_type(value)))?; return Ok( Some(crate::speech::process_include(path, include_file_name, do_include_fn)?) ); @@ -265,12 +255,8 @@ fn build_values(definition: &Yaml, use_speech_defs: bool, path: &Path) -> Result } }; - let definitions = if use_speech_defs {&SPEECH_DEFINITIONS} else {&BRAILLE_DEFINITIONS}; - return definitions.with(|definitions| { - let name_definition_map = &mut definitions.borrow_mut().name_to_var_mapping; - name_definition_map.insert(def_name.to_string(), result); - return Ok(None); - }); + definitions.name_to_var_mapping.insert(def_name.to_string(), result); + return Ok(None); fn get_vec_values(values: &Vec) -> Result> { let mut result = Vec::with_capacity(values.len()); @@ -325,7 +311,7 @@ mod tests { // Rule::DefinitionList //debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list, 0)); for variable_def in variable_def_list.as_vec().unwrap() { - if let Err(e) = build_values(variable_def, true, &Path::new("")) { + if let Err(e) = SPEECH_DEFINITIONS.with_borrow_mut(|defs| build_values(variable_def, &Path::new(""), defs)) { bail!("{}", crate::interface::errors_to_string(&e.chain_err(||format!("in file {:?}", numbers)))); } } @@ -351,7 +337,7 @@ mod tests { // Rule::DefinitionList //debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list, 0)); for variable_def in variable_def_list.as_vec().unwrap() { - if let Err(e) = 
build_values(variable_def, true, &Path::new("")) { + if let Err(e) = SPEECH_DEFINITIONS.with_borrow_mut(|defs| build_values(variable_def, &Path::new(""), defs)) { bail!("{}", crate::interface::errors_to_string(&e.chain_err(||format!("in file {:?}", likely_function_names)))); } } @@ -376,15 +362,14 @@ mod tests { // Rule::DefinitionList //debug!("variable_def_list {} is\n{}", yaml_to_type(variable_def_list), yaml_to_string(variable_def_list, 0)); for variable_def in variable_def_list.as_vec().unwrap() { - if let Err(e) = build_values(variable_def, true, &Path::new("")) { + if let Err(e) = SPEECH_DEFINITIONS.with_borrow_mut(|defs| build_values(variable_def, &Path::new(""), defs)) { bail!("{}", crate::interface::errors_to_string(&e.chain_err(||format!("in file {:?}", units)))); } } return Ok(vec![]); }; compile_rule(&units, defs_build_fn).unwrap(); - SPEECH_DEFINITIONS.with(|defs| { - let defs = defs.borrow(); + SPEECH_DEFINITIONS.with_borrow(|defs| { let names = defs.get_hashmap("Units"); assert!(names.is_some()); let names = names.unwrap(); @@ -394,4 +379,4 @@ mod tests { assert_eq!(names.get("xxx"), None); }); } -} \ No newline at end of file +} diff --git a/src/element_util.rs b/src/element_util.rs new file mode 100644 index 00000000..e0c7c4f8 --- /dev/null +++ b/src/element_util.rs @@ -0,0 +1,326 @@ +use crate::canonicalize::{as_element, name}; +use crate::errors::*; +use crate::logs::enable_logs; +use crate::xpath_functions::{is_leaf, IsNode}; +use lazy_static::lazy_static; +use regex::Regex; +use sxd_document::{Package}; +use sxd_document::dom::*; + +pub fn get_element(package: &Package) -> Element { + enable_logs(); + let doc = package.as_document(); + let mut result = None; + for root_child in doc.root().children() { + if let ChildOfRoot::Element(e) = root_child { + assert!(result.is_none()); + result = Some(e); + } + } + return result.unwrap(); +} + +/// returns Ok() if two Documents are equal or some info where they differ in the Err +// Not really meant to 
be public -- used by tests in some packages +#[allow(dead_code)] +pub fn is_same_element(e1: Element, e2: Element) -> Result<()> { + enable_logs(); + if name(e1) != name(e2) { + bail!("Names not the same: {}, {}", name(e1), name(e2)); + } + + // assume 'e' doesn't have element children until proven otherwise + // this means we keep Text children until we are proven they aren't needed + if e1.children().len() != e2.children().len() { + bail!( + "Children of {} have {} != {} children", + name(e1), + e1.children().len(), + e2.children().len() + ); + } + + if let Err(e) = attrs_are_same(e1.attributes(), e2.attributes()) { + bail!("In element {}, {}", name(e1), e); + } + + for (i, (c1, c2)) in e1.children().iter().zip(e2.children().iter()).enumerate() { + match c1 { + ChildOfElement::Element(child1) => { + if let ChildOfElement::Element(child2) = c2 { + is_same_element(*child1, *child2)?; + } else { + bail!("{} child #{}, first is element, second is something else", name(e1), i); + } + } + ChildOfElement::Comment(com1) => { + if let ChildOfElement::Comment(com2) = c2 { + if com1.text() != com2.text() { + bail!("{} child #{} -- comment text differs", name(e1), i); + } + } else { + bail!("{} child #{}, first is comment, second is something else", name(e1), i); + } + } + ChildOfElement::ProcessingInstruction(p1) => { + if let ChildOfElement::ProcessingInstruction(p2) = c2 { + if p1.target() != p2.target() || p1.value() != p2.value() { + bail!("{} child #{} -- processing instruction differs", name(e1), i); + } + } else { + bail!( + "{} child #{}, first is processing instruction, second is something else", + name(e1), + i + ); + } + } + ChildOfElement::Text(t1) => { + if let ChildOfElement::Text(t2) = c2 { + if t1.text() != t2.text() { + bail!("{} child #{} -- text differs", name(e1), i); + } + } else { + bail!("{} child #{}, first is text, second is something else", name(e1), i); + } + } + } + } + return Ok(()); + + /// compares attributes -- '==' didn't seems to work + fn 
attrs_are_same(attrs1: Vec, attrs2: Vec) -> Result<()> { + if attrs1.len() != attrs2.len() { + bail!("Attributes have different length: {:?} != {:?}", attrs1, attrs2); + } + // can't guarantee attrs are in the same order + for attr1 in attrs1 { + if let Some(found_attr2) = attrs2 + .iter() + .find(|&attr2| attr1.name().local_part() == attr2.name().local_part()) + { + if attr1.value() == found_attr2.value() { + continue; + } else { + bail!( + "Attribute named {} has differing values:\n '{}'\n '{}'", + attr1.name().local_part(), + attr1.value(), + found_attr2.value() + ); + } + } else { + bail!( + "Attribute name {} not in [{}]", + print_attr(&attr1), + print_attrs(&attrs2) + ); + } + } + return Ok(()); + + fn print_attr(attr: &Attribute) -> String { + return format!("@{}='{}'", attr.name().local_part(), attr.value()); + } + fn print_attrs(attrs: &[Attribute]) -> String { + return attrs.iter().map(print_attr).collect::>().join(", "); + } + } +} + +// used for testing trim +/// returns Ok() if two Documents are equal or some info where they differ in the Err +#[allow(dead_code)] +pub(crate) fn is_same_doc(doc1: &Document, doc2: &Document) -> Result<()> { + // assume 'e' doesn't have element children until proven otherwise + // this means we keep Text children until we are proven they aren't needed + if doc1.root().children().len() != doc2.root().children().len() { + bail!( + "Children of docs have {} != {} children", + doc1.root().children().len(), + doc2.root().children().len() + ); + } + + for (i, (c1, c2)) in doc1 + .root() + .children() + .iter() + .zip(doc2.root().children().iter()) + .enumerate() + { + match c1 { + ChildOfRoot::Element(e1) => { + if let ChildOfRoot::Element(e2) = c2 { + is_same_element(*e1, *e2)?; + } else { + bail!("child #{}, first is element, second is something else", i); + } + } + ChildOfRoot::Comment(com1) => { + if let ChildOfRoot::Comment(com2) = c2 { + if com1.text() != com2.text() { + bail!("child #{} -- comment text differs", i); + } 
+ } else { + bail!("child #{}, first is comment, second is something else", i); + } + } + ChildOfRoot::ProcessingInstruction(p1) => { + if let ChildOfRoot::ProcessingInstruction(p2) = c2 { + if p1.target() != p2.target() || p1.value() != p2.value() { + bail!("child #{} -- processing instruction differs", i); + } + } else { + bail!( + "child #{}, first is processing instruction, second is something else", + i + ); + } + } + } + } + return Ok(()); +} + +/// Not really meant to be public -- used by tests in some packages +pub fn trim_element(e: Element, allow_structure_in_leaves: bool) { + // "this is text { + trim_element(c, allow_structure_in_leaves); + } + ChildOfElement::Text(t) => { + single_text += t.text(); + e.remove_child(child); + } + _ => { + e.remove_child(child); + } + } + } + + // CSS considers only space, tab, linefeed, and carriage return as collapsable whitespace + if !(is_leaf(e) || name(e) == "intent-literal" || single_text.is_empty()) { + // intent-literal comes from testing intent + // FIX: we have a problem -- what should happen??? 
+ // FIX: For now, just keep the children and ignore the text and log an error -- shouldn't panic/crash + if !single_text.trim_matches(WHITESPACE).is_empty() { + error!( + "trim_element: both element and textual children which shouldn't happen -- ignoring text '{}'", + single_text + ); + } + return; + } + if e.children().is_empty() && !single_text.is_empty() { + // debug!("Combining text in {}: '{}' -> '{}'", e.name().local_part(), single_text, trimmed_text); + e.set_text(&WHITESPACE_MATCH.replace_all(&single_text, " ")); + } + + fn make_leaf_element(mathml_leaf: Element) { + // MathML leaves like really shouldn't have non-textual content, but you could have embedded HTML + // Here, we take convert them to leaves by grabbing up all the text and making that the content + // Potentially, we leave them and let (default) rules do something, but it makes other parts of the code + // messier because checking the text of a leaf becomes Option<&str> rather than just &str + let children = mathml_leaf.children(); + if children.is_empty() { + return; + } + + // gather up the text + let mut text = "".to_string(); + for child in children { + let child_text = match child { + ChildOfElement::Element(child) => { + if name(child) == "mglyph" { + child.attribute_value("alt").unwrap_or("").to_string() + } else { + gather_text(child) + } + } + ChildOfElement::Text(t) => { + // debug!("ChildOfElement::Text: '{}'", t.text()); + t.text().to_string() + } + _ => "".to_string(), + }; + if !child_text.is_empty() { + text += &child_text; + } + } + + // get rid of the old children and replace with the text we just built + mathml_leaf.clear_children(); + mathml_leaf.set_text(WHITESPACE_MATCH.replace_all(&text, " ").trim_matches(WHITESPACE)); + // debug!("make_leaf_element: text is '{}'", crate::canonicalize::as_text(mathml_leaf)); + + /// gather up all the contents of the element and return them with a leading space + fn gather_text(html: Element) -> String { + let mut text = "".to_string(); // 
since we are throwing out the element tag, add a space between the contents + for child in html.children() { + match child { + ChildOfElement::Element(child) => { + text += &gather_text(child); + } + ChildOfElement::Text(t) => text += t.text(), + _ => (), + } + } + // debug!("gather_text: '{}'", text); + return text; + } + } +} + +pub(crate) fn add_ids(mathml: Element) -> Element { + use std::time::SystemTime; + let time = if cfg!(target_family = "wasm") { + fastrand::usize(..) + } else { + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis() as usize + }; + let time_part = radix_fmt::radix(time, 36).to_string(); + let random_part = radix_fmt::radix(fastrand::u32(..), 36).to_string(); + let prefix = "M".to_string() + &time_part[time_part.len() - 3..] + &random_part[random_part.len() - 4..] + "-"; // begin with letter + add_ids_to_all(mathml, &prefix, 0); + return mathml; + + fn add_ids_to_all(mathml: Element, id_prefix: &str, count: usize) -> usize { + let mut count = count; + if mathml.attribute("id").is_none() { + mathml.set_attribute_value("id", (id_prefix.to_string() + &count.to_string()).as_str()); + mathml.set_attribute_value("data-id-added", "true"); + count += 1; + }; + + if crate::xpath_functions::is_leaf(mathml) { + return count; + } + + for child in mathml.children() { + let child = as_element(child); + count = add_ids_to_all(child, id_prefix, count); + } + return count; + } +} diff --git a/src/infer_intent.rs b/src/infer_intent.rs index 0248778e..72142e9d 100644 --- a/src/infer_intent.rs +++ b/src/infer_intent.rs @@ -611,7 +611,8 @@ mod tests { fn test_intent(mathml: &str, target: &str, intent_error_recovery: &str) -> bool { - use crate::interface::*; + use crate::interface::set_preference; + use crate::element_util::{get_element, trim_element, is_same_element}; // this forces initialization crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); // 
crate::speech::SpeechRules::initialize_all_rules().unwrap(); diff --git a/src/interface.rs b/src/interface.rs index b9a99c40..4e068c24 100644 --- a/src/interface.rs +++ b/src/interface.rs @@ -2,55 +2,18 @@ //! #![allow(non_snake_case)] #![allow(clippy::needless_return)] -use std::cell::RefCell; - +use crate::canonicalize::{as_element, name, CanonicalizeContext}; use crate::canonicalize::{as_text, create_mathml_element}; +use crate::element_util::{get_element, trim_element}; use crate::errors::*; -use phf::phf_map; -use regex::{Captures, Regex}; -use sxd_document::dom::*; -use sxd_document::parser; -use sxd_document::Package; - -use crate::canonicalize::{as_element, name}; - +use crate::logs::enable_logs; use crate::navigate::*; use crate::pretty_print::mml_to_string; -use crate::xpath_functions::{is_leaf, IsNode}; - -#[cfg(feature = "enable-logs")] -use std::sync::Once; -#[cfg(feature = "enable-logs")] -static INIT: Once = Once::new(); - -fn enable_logs() { - #[cfg(feature = "enable-logs")] - INIT.call_once(||{ - #[cfg(target_os = "android")] - { - extern crate log; - extern crate android_logger; - - use log::*; - use android_logger::*; - - android_logger::init_once( - Config::default() - .with_max_level(LevelFilter::Trace) - .with_tag("MathCat") - ); - trace!("Activated Android logger!"); - } - }); -} - -// wrap up some common functionality between the call from 'main' and AT -fn cleanup_mathml(mathml: Element) -> Result { - trim_element(mathml, false); - let mathml = crate::canonicalize::canonicalize(mathml)?; - let mathml = add_ids(mathml); - return Ok(mathml); -} +use crate::stateless_interface::{cleanup_mathml, create_mathml_instance_with_text}; +use crate::xpath_functions::is_leaf; +use std::cell::RefCell; +use sxd_document::{parser, Package}; +use sxd_document::dom::*; thread_local! { /// The current node being navigated (also spoken and brailled) is stored in `MATHML_INSTANCE`. 
@@ -93,14 +56,6 @@ pub fn get_version() -> String { /// The ids can be used for sync highlighting if the `Bookmark` API preference is true. pub fn set_mathml(mathml_str: String) -> Result { enable_logs(); - lazy_static! { - // if these are present when resent to MathJaX, MathJaX crashes (https://github.com/mathjax/MathJax/issues/2822) - static ref MATHJAX_V2: Regex = Regex::new(r#"class *= *['"]MJX-.*?['"]"#).unwrap(); - static ref MATHJAX_V3: Regex = Regex::new(r#"class *= *['"]data-mjx-.*?['"]"#).unwrap(); - static ref NAMESPACE_DECL: Regex = Regex::new(r#"xmlns:[[:alpha:]]+"#).unwrap(); // very limited namespace prefix match - static ref PREFIX: Regex = Regex::new(r#"( Result { // We need the main definitions files to be read in so canonicalize can work. // This call reads all of them for the current preferences, but that's ok since they will likely be used - crate::speech::SPEECH_RULES.with(|rules| rules.borrow_mut().read_files())?; + crate::speech::SPEECH_RULES.with_borrow_mut(|rules| rules.read_files())?; return MATHML_INSTANCE.with(|old_package| { - static HTML_ENTITIES_MAPPING: phf::Map<&str, &str> = include!("entities.in"); - - let mut error_message = "".to_string(); // can't return a result inside the replace_all, so we do this hack of setting the message and then returning the error - // need to deal with character data and convert to something the parser knows - let mathml_str = - HTML_ENTITIES.replace_all(&mathml_str, |cap: &Captures| match HTML_ENTITIES_MAPPING.get(&cap[1]) { - None => { - error_message = format!("No entity named '{}'", &cap[0]); - cap[0].to_string() - } - Some(&ch) => ch.to_string(), - }); - - if !error_message.is_empty() { - bail!(error_message); - } - let mathml_str = MATHJAX_V2.replace_all(&mathml_str, ""); - let mathml_str = MATHJAX_V3.replace_all(&mathml_str, ""); - - // the speech rules use the xpath "name" function and that includes the prefix - // getting rid of the prefix properly probably involves a recursive replacement in 
the tree - // if the prefix is used, it is almost certainly something like "m" or "mml", so this cheat will work. - let mathml_str = NAMESPACE_DECL.replace(&mathml_str, "xmlns"); // do this before the PREFIX replace! - let mathml_str = PREFIX.replace_all(&mathml_str, "$1"); - - let new_package = parser::parse(&mathml_str); - if let Err(e) = new_package { - bail!("Invalid MathML input:\n{}\nError is: {}", &mathml_str, &e.to_string()); - } - - let new_package = new_package.unwrap(); - let mathml = get_element(&new_package); - let mathml = cleanup_mathml(mathml)?; - let mathml_string = mml_to_string(mathml); + let (new_package, canonicalized_mathml) = + crate::definitions::SPEECH_DEFINITIONS.with_borrow(|definitions| + create_mathml_instance_with_text(&CanonicalizeContext::new_from_global_prefs_cached(), definitions, &mathml_str) + )?; old_package.replace(new_package); - - return Ok(mathml_string); + return Ok(canonicalized_mathml); }); } @@ -540,61 +464,14 @@ pub fn errors_to_string(e: &Error) -> String { return result; } -fn add_ids(mathml: Element) -> Element { - use std::time::SystemTime; - let time = if cfg!(target_family = "wasm") { - fastrand::usize(..) - } else { - SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap() - .as_millis() as usize - }; - let time_part = radix_fmt::radix(time, 36).to_string(); - let random_part = radix_fmt::radix(fastrand::u32(..), 36).to_string(); - let prefix = "M".to_string() + &time_part[time_part.len() - 3..] + &random_part[random_part.len() - 4..] 
+ "-"; // begin with letter - add_ids_to_all(mathml, &prefix, 0); - return mathml; - - fn add_ids_to_all(mathml: Element, id_prefix: &str, count: usize) -> usize { - let mut count = count; - if mathml.attribute("id").is_none() { - mathml.set_attribute_value("id", (id_prefix.to_string() + &count.to_string()).as_str()); - mathml.set_attribute_value("data-id-added", "true"); - count += 1; - }; - - if crate::xpath_functions::is_leaf(mathml) { - return count; - } - - for child in mathml.children() { - let child = as_element(child); - count = add_ids_to_all(child, id_prefix, count); - } - return count; - } -} - -pub fn get_element(package: &Package) -> Element { - enable_logs(); - let doc = package.as_document(); - let mut result = None; - for root_child in doc.root().children() { - if let ChildOfRoot::Element(e) = root_child { - assert!(result.is_none()); - result = Some(e); - } - } - return result.unwrap(); -} - /// Get the intent after setting the MathML /// Used in testing #[allow(dead_code)] pub fn get_intent<'a>(mathml: Element<'a>, doc: Document<'a>) -> Result> { crate::speech::SPEECH_RULES.with(|rules| rules.borrow_mut().read_files().unwrap()); - let mathml = cleanup_mathml(mathml)?; + let mathml = crate::definitions::SPEECH_DEFINITIONS.with_borrow(|definitions| + cleanup_mathml(&CanonicalizeContext::new_from_global_prefs_cached(), definitions, mathml) + )?; return crate::speech::intent_from_mathml(mathml, doc); } @@ -609,280 +486,12 @@ fn trim_doc(doc: &Document) { } } -/// Not really meant to be public -- used by tests in some packages -pub fn trim_element(e: Element, allow_structure_in_leaves: bool) { - // "this is text { - trim_element(c, allow_structure_in_leaves); - } - ChildOfElement::Text(t) => { - single_text += t.text(); - e.remove_child(child); - } - _ => { - e.remove_child(child); - } - } - } - - // CSS considers only space, tab, linefeed, and carriage return as collapsable whitespace - if !(is_leaf(e) || name(e) == "intent-literal" || 
single_text.is_empty()) { - // intent-literal comes from testing intent - // FIX: we have a problem -- what should happen??? - // FIX: For now, just keep the children and ignore the text and log an error -- shouldn't panic/crash - if !single_text.trim_matches(WHITESPACE).is_empty() { - error!( - "trim_element: both element and textual children which shouldn't happen -- ignoring text '{}'", - single_text - ); - } - return; - } - if e.children().is_empty() && !single_text.is_empty() { - // debug!("Combining text in {}: '{}' -> '{}'", e.name().local_part(), single_text, trimmed_text); - e.set_text(&WHITESPACE_MATCH.replace_all(&single_text, " ")); - } - - fn make_leaf_element(mathml_leaf: Element) { - // MathML leaves like really shouldn't have non-textual content, but you could have embedded HTML - // Here, we take convert them to leaves by grabbing up all the text and making that the content - // Potentially, we leave them and let (default) rules do something, but it makes other parts of the code - // messier because checking the text of a leaf becomes Option<&str> rather than just &str - let children = mathml_leaf.children(); - if children.is_empty() { - return; - } - - // gather up the text - let mut text = "".to_string(); - for child in children { - let child_text = match child { - ChildOfElement::Element(child) => { - if name(child) == "mglyph" { - child.attribute_value("alt").unwrap_or("").to_string() - } else { - gather_text(child) - } - } - ChildOfElement::Text(t) => { - // debug!("ChildOfElement::Text: '{}'", t.text()); - t.text().to_string() - } - _ => "".to_string(), - }; - if !child_text.is_empty() { - text += &child_text; - } - } - - // get rid of the old children and replace with the text we just built - mathml_leaf.clear_children(); - mathml_leaf.set_text(WHITESPACE_MATCH.replace_all(&text, " ").trim_matches(WHITESPACE)); - // debug!("make_leaf_element: text is '{}'", crate::canonicalize::as_text(mathml_leaf)); - - /// gather up all the contents of the 
element and return them with a leading space - fn gather_text(html: Element) -> String { - let mut text = "".to_string(); // since we are throwing out the element tag, add a space between the contents - for child in html.children() { - match child { - ChildOfElement::Element(child) => { - text += &gather_text(child); - } - ChildOfElement::Text(t) => text += t.text(), - _ => (), - } - } - // debug!("gather_text: '{}'", text); - return text; - } - } -} - -// used for testing trim -/// returns Ok() if two Documents are equal or some info where they differ in the Err -#[allow(dead_code)] -fn is_same_doc(doc1: &Document, doc2: &Document) -> Result<()> { - // assume 'e' doesn't have element children until proven otherwise - // this means we keep Text children until we are proven they aren't needed - if doc1.root().children().len() != doc2.root().children().len() { - bail!( - "Children of docs have {} != {} children", - doc1.root().children().len(), - doc2.root().children().len() - ); - } - - for (i, (c1, c2)) in doc1 - .root() - .children() - .iter() - .zip(doc2.root().children().iter()) - .enumerate() - { - match c1 { - ChildOfRoot::Element(e1) => { - if let ChildOfRoot::Element(e2) = c2 { - is_same_element(*e1, *e2)?; - } else { - bail!("child #{}, first is element, second is something else", i); - } - } - ChildOfRoot::Comment(com1) => { - if let ChildOfRoot::Comment(com2) = c2 { - if com1.text() != com2.text() { - bail!("child #{} -- comment text differs", i); - } - } else { - bail!("child #{}, first is comment, second is something else", i); - } - } - ChildOfRoot::ProcessingInstruction(p1) => { - if let ChildOfRoot::ProcessingInstruction(p2) = c2 { - if p1.target() != p2.target() || p1.value() != p2.value() { - bail!("child #{} -- processing instruction differs", i); - } - } else { - bail!( - "child #{}, first is processing instruction, second is something else", - i - ); - } - } - } - } - return Ok(()); -} - -/// returns Ok() if two Documents are equal or some info 
where they differ in the Err -// Not really meant to be public -- used by tests in some packages -#[allow(dead_code)] -pub fn is_same_element(e1: Element, e2: Element) -> Result<()> { - enable_logs(); - if name(e1) != name(e2) { - bail!("Names not the same: {}, {}", name(e1), name(e2)); - } - - // assume 'e' doesn't have element children until proven otherwise - // this means we keep Text children until we are proven they aren't needed - if e1.children().len() != e2.children().len() { - bail!( - "Children of {} have {} != {} children", - name(e1), - e1.children().len(), - e2.children().len() - ); - } - - if let Err(e) = attrs_are_same(e1.attributes(), e2.attributes()) { - bail!("In element {}, {}", name(e1), e); - } - - for (i, (c1, c2)) in e1.children().iter().zip(e2.children().iter()).enumerate() { - match c1 { - ChildOfElement::Element(child1) => { - if let ChildOfElement::Element(child2) = c2 { - is_same_element(*child1, *child2)?; - } else { - bail!("{} child #{}, first is element, second is something else", name(e1), i); - } - } - ChildOfElement::Comment(com1) => { - if let ChildOfElement::Comment(com2) = c2 { - if com1.text() != com2.text() { - bail!("{} child #{} -- comment text differs", name(e1), i); - } - } else { - bail!("{} child #{}, first is comment, second is something else", name(e1), i); - } - } - ChildOfElement::ProcessingInstruction(p1) => { - if let ChildOfElement::ProcessingInstruction(p2) = c2 { - if p1.target() != p2.target() || p1.value() != p2.value() { - bail!("{} child #{} -- processing instruction differs", name(e1), i); - } - } else { - bail!( - "{} child #{}, first is processing instruction, second is something else", - name(e1), - i - ); - } - } - ChildOfElement::Text(t1) => { - if let ChildOfElement::Text(t2) = c2 { - if t1.text() != t2.text() { - bail!("{} child #{} -- text differs", name(e1), i); - } - } else { - bail!("{} child #{}, first is text, second is something else", name(e1), i); - } - } - } - } - return Ok(()); - - /// 
compares attributes -- '==' didn't seems to work - fn attrs_are_same(attrs1: Vec, attrs2: Vec) -> Result<()> { - if attrs1.len() != attrs2.len() { - bail!("Attributes have different length: {:?} != {:?}", attrs1, attrs2); - } - // can't guarantee attrs are in the same order - for attr1 in attrs1 { - if let Some(found_attr2) = attrs2 - .iter() - .find(|&attr2| attr1.name().local_part() == attr2.name().local_part()) - { - if attr1.value() == found_attr2.value() { - continue; - } else { - bail!( - "Attribute named {} has differing values:\n '{}'\n '{}'", - attr1.name().local_part(), - attr1.value(), - found_attr2.value() - ); - } - } else { - bail!( - "Attribute name {} not in [{}]", - print_attr(&attr1), - print_attrs(&attrs2) - ); - } - } - return Ok(()); - - fn print_attr(attr: &Attribute) -> String { - return format!("@{}='{}'", attr.name().local_part(), attr.value()); - } - fn print_attrs(attrs: &[Attribute]) -> String { - return attrs.iter().map(print_attr).collect::>().join(", "); - } - } -} - #[cfg(test)] mod tests { #[allow(unused_imports)] use super::super::init_logger; use super::*; + use crate::element_util::is_same_doc; fn are_parsed_strs_equal(test: &str, target: &str) -> bool { let target_package = &parser::parse(target).expect("Failed to parse input"); @@ -976,6 +585,8 @@ mod tests { #[test] fn test_entities() { + use regex::Regex; + // this forces initialization set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 8ef01336..ca7c1efa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -46,13 +46,17 @@ extern crate cfg_if; pub mod interface; +pub mod stateless_interface; + #[cfg(feature = "include-zip")] pub use shim_filesystem::ZIPPED_RULE_FILES; mod canonicalize; +pub mod element_util; mod infer_intent; pub mod speech; mod braille; +mod logs; mod navigate; mod prefs; mod tts; @@ -63,6 +67,7 @@ mod chemistry; pub mod shim_filesystem; // really just for override_file_for_debugging_rules, but the config seems to 
throw it off pub use interface::*; +pub use stateless_interface::*; #[cfg(test)] pub fn init_logger() { @@ -90,9 +95,9 @@ pub fn abs_rules_dir_path() -> String { #[cfg(test)] pub fn are_strs_canonically_equal_with_locale(test: &str, target: &str, block_separators: &str, decimal_separators: &str) -> bool { - use crate::{interface::*, pretty_print::mml_to_string}; + use crate::{canonicalize::CanonicalizeContext, element_util::*, interface::*, pretty_print::mml_to_string}; + use crate::definitions::SPEECH_DEFINITIONS; use sxd_document::parser; - use crate::canonicalize::canonicalize; // this forces initialization crate::interface::set_rules_dir(abs_rules_dir_path()).unwrap(); crate::speech::SPEECH_RULES.with(|rules| rules.borrow_mut().read_files().unwrap()); @@ -104,7 +109,9 @@ pub fn are_strs_canonically_equal_with_locale(test: &str, target: &str, block_se let mathml = get_element(package1); trim_element(mathml, false); // debug!("test:\n{}", mml_to_string(mathml)); - let mathml_test = canonicalize(mathml).unwrap(); + + let canonicalize_context = CanonicalizeContext::new_from_global_prefs_cached(); + let mathml_test = SPEECH_DEFINITIONS.with_borrow(|definitions| canonicalize_context.canonicalize(definitions, mathml)).unwrap(); let package2 = &parser::parse(target).expect("Failed to parse target input"); let mathml_target = get_element(package2); diff --git a/src/logs.rs b/src/logs.rs new file mode 100644 index 00000000..c6058146 --- /dev/null +++ b/src/logs.rs @@ -0,0 +1,25 @@ +#[cfg(feature = "enable-logs")] +use std::sync::Once; +#[cfg(feature = "enable-logs")] +static INIT: Once = Once::new(); + +pub(crate) fn enable_logs() { + #[cfg(feature = "enable-logs")] + INIT.call_once(||{ + #[cfg(target_os = "android")] + { + extern crate log; + extern crate android_logger; + + use log::*; + use android_logger::*; + + android_logger::init_once( + Config::default() + .with_max_level(LevelFilter::Trace) + .with_tag("MathCat") + ); + trace!("Activated Android logger!"); + } + 
}); +} diff --git a/src/navigate.rs b/src/navigate.rs index 5573737f..5641a40b 100644 --- a/src/navigate.rs +++ b/src/navigate.rs @@ -2,7 +2,7 @@ //! See preference documentation for more info on navigation preferences. #![allow(clippy::needless_return)] -use std::cell::{Ref, RefCell, RefMut}; +use std::cell::{RefCell, RefMut}; use sxd_xpath::{Context, Factory, Value}; use sxd_document::dom::Element; use sxd_document::Package; @@ -18,6 +18,8 @@ use std::time::Instant; use crate::errors::*; use phf::phf_set; +#[cfg(test)] +use crate::element_util::{get_element}; const MAX_PLACE_MARKERS: usize = 10; @@ -349,9 +351,7 @@ fn do_navigate_command_and_param(mathml: Element, command: NavigationCommand, pa pub fn do_navigate_command_string(mathml: Element, nav_command: &'static str) -> Result { // first check to see if nav file has been changed -- don't bother checking in loop below - NAVIGATION_RULES.with(|rules| { - rules.borrow_mut().read_files() - })?; + NAVIGATION_RULES.with_borrow_mut(|rules| rules.read_files())?; if mathml.children().is_empty() { bail!("MathML has not been set -- can't navigate"); @@ -368,60 +368,62 @@ pub fn do_navigate_command_string(mathml: Element, nav_command: &'static str) -> }, "None") }; - return NAVIGATION_RULES.with(|rules| { - let rules = rules.borrow(); - let new_package = Package::new(); - let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), ""); - - nav_state.mode = rules.pref_manager.as_ref().borrow().pref_to_string("NavMode"); - nav_state.speak_overview = rules.pref_manager.as_ref().borrow().pref_to_string("Overview") == "true"; + return NAVIGATION_RULES.with_borrow(|rules| { + return crate::definitions::SPEECH_DEFINITIONS.with_borrow(|definitions| { + let new_package = Package::new(); + let mut rules_with_context = + SpeechRulesWithContext::new(&rules, definitions, new_package.as_document(), ""); - nav_state.init_navigation_context(rules_with_context.get_context(), nav_command, nav_state.top()); 
- - // start navigation off at the right node - if nav_command == "MoveLastLocation" { - nav_state.pop(); - } + nav_state.mode = rules.pref_manager.as_ref().borrow().pref_to_string("NavMode"); + nav_state.speak_overview = rules.pref_manager.as_ref().borrow().pref_to_string("Overview") == "true"; + + nav_state.init_navigation_context(rules_with_context.get_context(), nav_command, nav_state.top()); - // If no speech happened for some calls, we try the call again (e.g, no speech for invisible times). - // To prevent to infinite loop, we limit the number of tries - const LOOP_LIMIT: usize = 3; - let mut cumulative_speech = String::with_capacity(120); - for loop_count in 0..LOOP_LIMIT { - match apply_navigation_rules(mathml, nav_command, &rules, &mut rules_with_context, &mut nav_state, loop_count) { - Ok( (speech, done)) => { - cumulative_speech = cumulative_speech + if loop_count==0 {""} else {" "} + speech.trim(); - if done { - let (tts, rate) = { - let prefs = rules.pref_manager.borrow(); - (prefs.pref_to_string("TTS"), prefs.pref_to_string("MathRate")) - }; - if rate != "100" { - match tts.as_str() { - "SSML" => if !cumulative_speech.starts_with("{}", &rate, &cumulative_speech); - }, - "SAPI5" => if !cumulative_speech.starts_with("{}'>", - 10.0*(0.01*rate.parse::().unwrap_or(100.0)).log(3.0), cumulative_speech); - }, - _ => (), // do nothing + // start navigation off at the right node + if nav_command == "MoveLastLocation" { + nav_state.pop(); + } + + // If no speech happened for some calls, we try the call again (e.g, no speech for invisible times). 
+ // To prevent an infinite loop, we limit the number of tries + const LOOP_LIMIT: usize = 3; + let mut cumulative_speech = String::with_capacity(120); + for loop_count in 0..LOOP_LIMIT { + match apply_navigation_rules(mathml, nav_command, &rules, &mut rules_with_context, &mut nav_state, loop_count) { + Ok( (speech, done)) => { + cumulative_speech = cumulative_speech + if loop_count==0 {""} else {" "} + speech.trim(); + if done { + let (tts, rate) = { + let prefs = rules.pref_manager.borrow(); + (prefs.pref_to_string("TTS"), prefs.pref_to_string("MathRate")) + }; + if rate != "100" { + match tts.as_str() { + "SSML" => if !cumulative_speech.starts_with("{}", &rate, &cumulative_speech); + }, + "SAPI5" => if !cumulative_speech.starts_with("{}'>", + 10.0*(0.01*rate.parse::().unwrap_or(100.0)).log(3.0), cumulative_speech); + }, + _ => (), // do nothing + } } + return Ok( rules.pref_manager.borrow().get_tts() + .merge_pauses(crate::speech::remove_optional_indicators( + &cumulative_speech.replace(CONCAT_STRING, "") + .replace(CONCAT_INDICATOR, "") + ) + .trim_start().trim_end_matches([' ', ',', ';'])) ); } - return Ok( rules.pref_manager.borrow().get_tts() - .merge_pauses(crate::speech::remove_optional_indicators( - &cumulative_speech.replace(CONCAT_STRING, "") - .replace(CONCAT_INDICATOR, "") - ) - .trim_start().trim_end_matches([' ', ',', ';'])) ); + }, + Err(e) => { + return Err(e); } - }, - Err(e) => { - return Err(e); } } - } - bail!("Internal error: Navigation exceeded limit of number of times no speech generated."); + bail!("Internal error: Navigation exceeded limit of number of times no speech generated."); + }); }); }); @@ -443,7 +445,7 @@ pub fn do_navigate_command_string(mathml: Element, nav_command: &'static str) -> fn apply_navigation_rules<'c, 'm:'c>(mathml: Element<'m>, nav_command: &'static str, - rules: &Ref, rules_with_context: &mut SpeechRulesWithContext<'c, '_, 'm>, nav_state: &mut RefMut, + rules: &SpeechRules, rules_with_context: &mut
SpeechRulesWithContext<'c, '_, 'm>, nav_state: &mut RefMut, loop_count: usize) -> Result<(String, bool)> { let context = rules_with_context.get_context(); context.set_variable("MatchCounter", loop_count as f64); @@ -2198,4 +2200,4 @@ mod tests { }); } } -} \ No newline at end of file +} diff --git a/src/prefs.rs b/src/prefs.rs index 34fa5217..961c7f4a 100644 --- a/src/prefs.rs +++ b/src/prefs.rs @@ -216,10 +216,20 @@ thread_local!{ // Also note that if 'error' is not an empty string, SpeechRules can't work so using those requires a check. #[derive(Debug, Default)] pub struct PreferenceManager { - rules_dir: PathBuf, // full path to rules dir - error: String, // empty/default string if fields are set, otherwise error message - user_prefs: Preferences, // prefs that come from reading prefs.yaml (system and user locations) - api_prefs: Preferences, // prefs set by API calls (along with some defaults not in the user settings such as "pitch") + // Path to the rules directory. + rules_dir: PathBuf, + + // Stateful mode: empty/default string if fields are set, otherwise error message. + // Stateless mode: unused. + error: String, + + // Stateful mode: these prefs come from reading prefs.yaml (system and user locations) + // Stateless mode: set explicitly when building the context. + user_prefs: Preferences, + + // In stateful mode, prefs set by API calls (along with some defaults not in the user settings such as "pitch") + api_prefs: Preferences, + sys_prefs_file: Option, // the system prefs.yaml file user_prefs_file: Option, // the user prefs.yaml file intent: PathBuf, // the intent rule style file @@ -255,7 +265,7 @@ impl fmt::Display for PreferenceManager { } impl PreferenceManager { - /// Initialize (the) PreferenceManager (a global var). + /// Initialize a PreferenceManager. 
/// 'rules_dir' is the path to "Rules" unless the env var MathCATRulesDir is set /// /// If rules_dir is an empty PathBuf, the existing rules_dir is used (an error if it doesn't exist) @@ -272,7 +282,23 @@ impl PreferenceManager { return Ok( () ); } + pub fn initialize_for_stateless(&mut self, rules_dir: PathBuf) -> Result<()> { + // No need to canonicalize the path as we do not cache anything. + self.set_rules_dir(&rules_dir)?; + + // We don't want to read local preference files in the stateless version, + // so we don't call set_preference_files here. + + let language = self.user_prefs.prefs.get("Language").unwrap_or(&DEFAULT_LANG).clone(); + let language = language.as_str().unwrap(); + self.set_separators(language)?; + + self.set_all_files(&rules_dir)?; + return Ok( () ); + } + + /// Returns the global preference manager. pub fn get() -> Rc> { return PREF_MANAGER.with( |pm| pm.clone() ); } @@ -836,6 +862,41 @@ impl PreferenceManager { } +// Builds a PreferenceManager for stateless mode. +pub(crate) struct PreferenceManagerBuilder { + pref_manager: Rc>, + rules_dir: PathBuf, +} + +impl PreferenceManagerBuilder { + pub fn new() -> PreferenceManagerBuilder { + let result = PreferenceManagerBuilder { + pref_manager: Rc::new(RefCell::new(PreferenceManager::default())), + rules_dir: PathBuf::new(), + }; + result.pref_manager.borrow_mut().user_prefs = Preferences::user_defaults(); + return result; + } + + // Sets the rules directory. + pub fn set_rules_dir(&mut self, path: &Path) { + self.rules_dir = path.into(); + } + + + pub fn set_string_pref(&mut self, key: &str, value: &str) { + self.pref_manager.borrow_mut().user_prefs.prefs.insert(key.to_string(), Yaml::String(value.to_string())); + } + + pub fn build(mut self) -> Result>> { + // We never read the local preference files in stateless mode.
+ self.set_string_pref("CheckRuleFiles", "None"); + + self.pref_manager.borrow_mut().initialize_for_stateless(self.rules_dir)?; + return Ok(self.pref_manager); + } +} + #[cfg(test)] mod tests { #[allow(unused_imports)] diff --git a/src/speech.rs b/src/speech.rs index 9aa4302b..a82deadd 100644 --- a/src/speech.rs +++ b/src/speech.rs @@ -13,7 +13,7 @@ use sxd_xpath::{Context, Factory, Value, XPath}; use sxd_xpath::nodeset::Node; use std::fmt; use std::time::SystemTime; -use crate::definitions::read_definitions_file; +use crate::definitions::{read_definitions_file, Definitions, SPEECH_DEFINITIONS}; use crate::errors::*; use crate::prefs::*; use yaml_rust::{YamlLoader, Yaml, yaml::Hash}; @@ -58,6 +58,11 @@ pub fn unquote_string(str: &str) -> &str { return &str[..str.len()-N_BYTES_NO_EVAL_QUOTE_CHAR]; } +pub fn intent_from_rules_and_mathml<'m>(rules: &SpeechRules, definitions: &Definitions, mathml: Element, doc: Document<'m>) -> Result> { + let intent_tree = apply_intent_to_mathml(rules, definitions, doc, mathml, "")?; + doc.root().append_child(intent_tree); + return Ok(intent_tree); +} /// The main external call, `intent_from_mathml` returns a string for the speech associated with the `mathml`. /// It matches against the rules that are computed by user prefs such as "Language" and "SpeechStyle". @@ -70,9 +75,12 @@ pub fn unquote_string(str: &str) -> &str { /// A string is returned in call cases. /// If there is an error, the speech string will indicate an error. 
pub fn intent_from_mathml<'m>(mathml: Element, doc: Document<'m>) -> Result> { - let intent_tree = intent_rules(&INTENT_RULES, doc, mathml, "")?; - doc.root().append_child(intent_tree); - return Ok(intent_tree); + INTENT_RULES.with_borrow_mut(|rules| rules.read_files())?; + return INTENT_RULES.with_borrow(|rules| { + return crate::definitions::SPEECH_DEFINITIONS.with_borrow(|definitions| { + return intent_from_rules_and_mathml(rules, definitions, mathml, doc); + }); + }); } pub fn speak_mathml(mathml: Element, nav_node_id: &str) -> Result { @@ -83,40 +91,65 @@ pub fn overview_mathml(mathml: Element, nav_node_id: &str) -> Result { return speak_rules(&OVERVIEW_RULES, mathml, nav_node_id); } - -fn intent_rules<'m>(rules: &'static std::thread::LocalKey>, doc: Document<'m>, mathml: Element, nav_node_id: &'m str) -> Result> { - rules.with(|rules| { - rules.borrow_mut().read_files()?; - let rules = rules.borrow(); - // debug!("intent_rules:\n{}", mml_to_string(mathml)); - let should_set_literal_intent = rules.pref_manager.borrow().pref_to_string("SpeechStyle").as_str() == "LiteralSpeak"; - let original_intent = mathml.attribute_value("intent"); - if should_set_literal_intent { - if let Some(intent) = original_intent { - let intent = if intent.contains('(') {intent.replace('(', ":literal(")} else {intent.to_string() + ":literal"}; - mathml.set_attribute_value("intent", &intent); - } else { - mathml.set_attribute_value("intent", ":literal"); - }; - } - let mut rules_with_context = SpeechRulesWithContext::new(&rules, doc, nav_node_id); - let intent = rules_with_context.match_pattern::>(mathml) - .chain_err(|| "Pattern match/replacement failure!")?; - let answer = if name(intent) == "TEMP_NAME" { // unneeded extra layer - assert_eq!(intent.children().len(), 1); - as_element(intent.children()[0]) +fn apply_intent_to_mathml<'m>( + rules: &SpeechRules, definitions: &Definitions, + doc: Document<'m>, mathml: Element, nav_node_id: &'m str) -> Result> { + let 
should_set_literal_intent = rules.pref_manager.borrow().pref_to_string("SpeechStyle").as_str() == "LiteralSpeak"; + let original_intent = mathml.attribute_value("intent"); + if should_set_literal_intent { + if let Some(intent) = original_intent { + let intent = if intent.contains('(') {intent.replace('(', ":literal(")} else {intent.to_string() + ":literal"}; + mathml.set_attribute_value("intent", &intent); } else { - intent + mathml.set_attribute_value("intent", ":literal"); }; - if should_set_literal_intent { - if let Some(original_intent) = original_intent { - mathml.set_attribute_value("intent", original_intent); - } else { - mathml.remove_attribute("intent"); + } + let mut rules_with_context = SpeechRulesWithContext::new(&rules, &definitions, doc, nav_node_id); + let intent = rules_with_context.match_pattern::>(mathml) + .chain_err(|| "Pattern match/replacement failure!")?; + let answer = if name(intent) == "TEMP_NAME" { // unneeded extra layer + assert_eq!(intent.children().len(), 1); + as_element(intent.children()[0]) + } else { + intent + }; + if should_set_literal_intent { + if let Some(original_intent) = original_intent { + mathml.set_attribute_value("intent", original_intent); + } else { + mathml.remove_attribute("intent"); + } + } + return Ok(answer); +} + +/// Speak the MathML +/// If 'nav_node_id' is not an empty string, then the element with that id will have [[...]] around it +pub fn mathml_node_to_spoken_text(rules: &SpeechRules, definitions: &Definitions, mathml: Element, nav_node_id: &str) -> Result { + let new_package = Package::new(); + let mut rules_with_context = SpeechRulesWithContext::new(&rules, &definitions, new_package.as_document(), nav_node_id); + let mut speech_string = rules_with_context.match_pattern::(mathml) + .chain_err(|| "Pattern match/replacement failure!")?; + // debug!("speak_rules: nav_node_id={}, mathml id={}, speech_string='{}'", nav_node_id, mathml.attribute_value("id").unwrap_or_default(), &speech_string); + // Note: 
[[...]] is added around a matching child, but if the "id" is on 'mathml', the whole string is used + if !nav_node_id.is_empty() { + // See https://github.com/NSoiffer/MathCAT/issues/174 for why we can just start the speech at the nav node + if let Some(start) = speech_string.find("[[") { + match speech_string[start+2..].find("]]") { + None => bail!("Internal error: looking for '[[...]]' during navigation -- only found '[[' in '{}'", speech_string), + Some(end) => speech_string = speech_string[start+2..start+2+end].to_string(), } + } else { + bail!(NAV_NODE_SPEECH_NOT_FOUND); // NAV_NODE_SPEECH_NOT_FOUND is tested for later } - return Ok(answer); - }) + } + Ok(rules.pref_manager.borrow().get_tts() + .merge_pauses(remove_optional_indicators( + &speech_string.replace(CONCAT_STRING, "") + .replace(CONCAT_INDICATOR, "") + ) + .trim_start().trim_end_matches([' ', ',', ';']))) + } /// Speak the MathML @@ -125,30 +158,10 @@ fn speak_rules(rules: &'static std::thread::LocalKey>, math rules.with(|rules| { rules.borrow_mut().read_files()?; let rules = rules.borrow(); - // debug!("speak_rules:\n{}", mml_to_string(mathml)); - let new_package = Package::new(); - let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), nav_node_id); - let mut speech_string = rules_with_context.match_pattern::(mathml) - .chain_err(|| "Pattern match/replacement failure!")?; - // debug!("speak_rules: nav_node_id={}, mathml id={}, speech_string='{}'", nav_node_id, mathml.attribute_value("id").unwrap_or_default(), &speech_string); - // Note: [[...]] is added around a matching child, but if the "id" is on 'mathml', the whole string is used - if !nav_node_id.is_empty() { - // See https://github.com/NSoiffer/MathCAT/issues/174 for why we can just start the speech at the nav node - if let Some(start) = speech_string.find("[[") { - match speech_string[start+2..].find("]]") { - None => bail!("Internal error: looking for '[[...]]' during navigation -- only found '[[' in 
'{}'", speech_string), - Some(end) => speech_string = speech_string[start+2..start+2+end].to_string(), - } - } else { - bail!(NAV_NODE_SPEECH_NOT_FOUND); // NAV_NODE_SPEECH_NOT_FOUND is tested for later - } - } - return Ok( rules.pref_manager.borrow().get_tts() - .merge_pauses(remove_optional_indicators( - &speech_string.replace(CONCAT_STRING, "") - .replace(CONCAT_INDICATOR, "") - ) - .trim_start().trim_end_matches([' ', ',', ';'])) ); + crate::definitions::SPEECH_DEFINITIONS.with_borrow(|definitions| + // debug!("speak_rules:\n{}", mml_to_string(mathml)); + mathml_node_to_spoken_text(&rules, definitions, mathml, nav_node_id) + ) }) } @@ -1668,10 +1681,10 @@ impl fmt::Display for ContextStack<'_> { } impl<'c, 'r> ContextStack<'c> { - fn new<'a,>(pref_manager: &'a PreferenceManager) -> ContextStack<'c> { + fn new<'a,>(pref_manager: &'a PreferenceManager, definitions: &'a Definitions) -> ContextStack<'c> { let prefs = pref_manager.merge_prefs(); let mut context_stack = ContextStack { - base: ContextStack::base_context(prefs), + base: ContextStack::base_context(pref_manager, definitions, prefs), old_values: Vec::with_capacity(31) // should avoid allocations }; // FIX: the list of variables to set should come from definitions.yaml @@ -1683,10 +1696,10 @@ impl<'c, 'r> ContextStack<'c> { return context_stack; } - fn base_context(var_defs: PreferenceHashMap) -> Context<'c> { + fn base_context<'a>(pref_manager: &'a PreferenceManager, definitions: &'a Definitions, var_defs: PreferenceHashMap) -> Context<'c> { let mut context = Context::new(); context.set_namespace("m", "http://www.w3.org/1998/Math/MathML"); - crate::xpath_functions::add_builtin_functions(&mut context); + crate::xpath_functions::add_builtin_functions(&pref_manager, &definitions, &mut context); for (key, value) in var_defs { context.set_variable(key.as_str(), yaml_to_value(&value)); // if let Some(str_value) = value.as_str() { @@ -2125,23 +2138,23 @@ thread_local!{ /// The current set of speech rules // 
maybe this should be a small cache of rules in case people switch rules/prefs? pub static INTENT_RULES: RefCell = - RefCell::new( SpeechRules::new(RulesFor::Intent, true) ); + RefCell::new( SpeechRules::new(RulesFor::Intent, true, PreferenceManager::get()) ); pub static SPEECH_RULES: RefCell = - RefCell::new( SpeechRules::new(RulesFor::Speech, true) ); + RefCell::new( SpeechRules::new(RulesFor::Speech, true, PreferenceManager::get()) ); pub static OVERVIEW_RULES: RefCell = - RefCell::new( SpeechRules::new(RulesFor::OverView, true) ); + RefCell::new( SpeechRules::new(RulesFor::OverView, true, PreferenceManager::get()) ); pub static NAVIGATION_RULES: RefCell = - RefCell::new( SpeechRules::new(RulesFor::Navigation, true) ); + RefCell::new( SpeechRules::new(RulesFor::Navigation, true, PreferenceManager::get()) ); pub static BRAILLE_RULES: RefCell = - RefCell::new( SpeechRules::new(RulesFor::Braille, false) ); + RefCell::new( SpeechRules::new(RulesFor::Braille, false, PreferenceManager::get()) ); } impl SpeechRules { - pub fn new(name: RulesFor, translate_single_chars_only: bool) -> SpeechRules { + pub fn new(name: RulesFor, translate_single_chars_only: bool, pref_manager: Rc>) -> SpeechRules { let globals = if name == RulesFor::Braille { ( (BRAILLE_UNICODE_SHORT.with(Rc::clone), BRAILLE_UNICODE_SHORT_FILES_AND_TIMES.with(Rc::clone)), @@ -2167,9 +2180,25 @@ impl SpeechRules { unicode_full_files: globals.1.1, definitions_files: globals.2, translate_single_chars_only, - pref_manager: PreferenceManager::get(), + pref_manager: pref_manager, }; -} + } + + pub fn new_stateless(name: RulesFor, translate_single_chars_only: bool, pref_manager: Rc>) -> SpeechRules { + return SpeechRules { + error: Default::default(), + name, + rules: HashMap::with_capacity(if name == RulesFor::Intent || name == RulesFor::Speech {500} else {50}), // lazy load them + rule_files: FilesAndTimes::default(), + unicode_short: Rc::new(RefCell::new(HashMap::with_capacity(500))), + unicode_short_files: 
Rc::new(RefCell::new(FilesAndTimes::default())), + unicode_full: Rc::new(RefCell::new(HashMap::with_capacity(6500))), + unicode_full_files: Rc::new(RefCell::new(FilesAndTimes::default())), + definitions_files: Rc::new(RefCell::new(FilesAndTimes::default())), + translate_single_chars_only, + pref_manager: pref_manager, + }; + } pub fn get_error(&self) -> Option<&str> { return if self.error.is_empty() { @@ -2180,6 +2209,21 @@ impl SpeechRules { } pub fn read_files(&mut self) -> Result<()> { + use crate::definitions::SPEECH_DEFINITIONS; + use crate::definitions::BRAILLE_DEFINITIONS; + let definitions = if self.name != RulesFor::Braille {&SPEECH_DEFINITIONS} else {&BRAILLE_DEFINITIONS}; + return definitions.with(|definitions| { + // If definitions are already borrowed, we assume they've been read up the stack and skip reading them. + let mut try_defs = definitions.try_borrow_mut(); + let maybe_defs: Option<&mut Definitions> = match try_defs { + Ok(ref mut defs) => Some(defs), + Err(_) => None, + }; + return self.read_files_for_stateless(maybe_defs); + }); + } + + pub fn read_files_for_stateless(&mut self, definitions: Option<&mut Definitions>) -> Result<()> { let check_rule_files = self.pref_manager.borrow().pref_to_string("CheckRuleFiles"); if check_rule_files != "None" { // "Prefs" or "All" are other values self.pref_manager.borrow_mut().set_preference_files()?; @@ -2200,15 +2244,25 @@ impl SpeechRules { self.unicode_short_files.borrow_mut().set_files_and_times(self.read_unicode(None, true)?); } + if let Some(definitions) = definitions { + self.read_definitions(should_ignore_file_time, definitions)?; + } + return Ok( () ); + } + + fn read_definitions(&self, should_ignore_file_time: bool, definitions: &mut Definitions) -> Result<()> { + let pref_manager = self.pref_manager.borrow(); if self.definitions_files.borrow().ft.is_empty() || !self.definitions_files.borrow().is_file_up_to_date( pref_manager.get_definitions_file(self.name != RulesFor::Braille),
should_ignore_file_time ) { - self.definitions_files.borrow_mut().set_files_and_times(read_definitions_file(self.name != RulesFor::Braille)?); + self.definitions_files.borrow_mut().set_files_and_times( + read_definitions_file(pref_manager.get_definitions_file(self.name != RulesFor::Braille), definitions)?); } - return Ok( () ); + Ok(()) } + fn read_patterns(&mut self, path: &Path) -> Result> { // info!("Reading rule file: {}", p.to_str().unwrap()); let rule_file_contents = read_to_string_shim(path).chain_err(|| format!("cannot read file '{}'", path.to_str().unwrap()))?; @@ -2311,13 +2365,13 @@ impl SpeechRules { /// We track three different lifetimes: /// 'c -- the lifetime of the context and mathml -/// 's -- the lifetime of the speech rules (which is static) +/// 's -- the lifetime of the speech rules and definitions /// 'r -- the lifetime of the reference (this seems to be key to keep the rust memory checker happy) impl<'c, 's:'c, 'r, 'm:'c> SpeechRulesWithContext<'c, 's,'m> { - pub fn new(speech_rules: &'s SpeechRules, doc: Document<'m>, nav_node_id: &'m str) -> SpeechRulesWithContext<'c, 's, 'm> { + pub fn new(speech_rules: &'s SpeechRules, definitions: &'s Definitions, doc: Document<'m>, nav_node_id: &'m str) -> SpeechRulesWithContext<'c, 's, 'm> { return SpeechRulesWithContext { speech_rules, - context_stack: ContextStack::new(&speech_rules.pref_manager.borrow()), + context_stack: ContextStack::new(&speech_rules.pref_manager.borrow(), &definitions), doc, nav_node_id, inside_spell: false, @@ -2673,12 +2727,13 @@ impl<'c, 's:'c, 'r, 'm:'c> SpeechRulesWithContext<'c, 's,'m> { /// Hack to allow replacement of `str` with braille chars. 
pub fn braille_replace_chars(str: &str, mathml: Element) -> Result { - return BRAILLE_RULES.with(|rules| { - let rules = rules.borrow(); - let new_package = Package::new(); - let mut rules_with_context = SpeechRulesWithContext::new(&rules, new_package.as_document(), ""); - return rules_with_context.replace_chars(str, mathml); - }) + return BRAILLE_RULES.with_borrow(|rules| { + return SPEECH_DEFINITIONS.with_borrow(|definitions| { + let new_package = Package::new(); + let mut rules_with_context = SpeechRulesWithContext::new(&rules, definitions, new_package.as_document(), ""); + return rules_with_context.replace_chars(str, mathml); + }); + }); } @@ -2696,7 +2751,7 @@ mod tests { {name: default, tag: math, match: ".", replace: [x: "./*"] }"#; let doc = YamlLoader::load_from_str(str).unwrap(); assert_eq!(doc.len(), 1); - let mut rules = SpeechRules::new(RulesFor::Speech, true); + let mut rules = SpeechRules::new(RulesFor::Speech, true, PreferenceManager::get()); SpeechPattern::build(&doc[0], Path::new("testing"), &mut rules).unwrap(); assert_eq!(rules.rules["math"].len(), 1, "\nshould only be one rule"); @@ -2715,7 +2770,7 @@ mod tests { {name: default, tag: math, match: ".", replace: [x: "./*"] }"#; let doc = YamlLoader::load_from_str(str).unwrap(); assert_eq!(doc.len(), 1); - let mut rules = SpeechRules::new(RulesFor::Speech, true); + let mut rules = SpeechRules::new(RulesFor::Speech, true, PreferenceManager::get()); SpeechPattern::build(&doc[0], Path::new("testing"), &mut rules).unwrap(); let str = r#"--- @@ -2738,7 +2793,7 @@ mod tests { {name: default, tag: math, match: ".", replace: [x: "./*"] }"#; let doc = YamlLoader::load_from_str(str).unwrap(); assert_eq!(doc.len(), 1); - let mut rules = SpeechRules::new(RulesFor::Speech, true); + let mut rules = SpeechRules::new(RulesFor::Speech, true, PreferenceManager::get()); SpeechPattern::build(&doc[0], Path::new("testing"), &mut rules).unwrap(); let str = r#"--- @@ -2845,4 +2900,4 @@ mod tests { // 
assert_eq!(result.unwrap(), r#"DEBUG(*[2]/*[3][DEBUG(text()='(')], "DEBUG(*[2]/*[3][DEBUG(text()='(')], \"text()='(')]\")"#); // } -} \ No newline at end of file +} diff --git a/src/stateless_interface.rs b/src/stateless_interface.rs new file mode 100644 index 00000000..5d77e86d --- /dev/null +++ b/src/stateless_interface.rs @@ -0,0 +1,137 @@ +//! This interface module provides a stateless API for converting from MathML to verbalized text. +//! +#![allow(non_snake_case)] +#![allow(clippy::needless_return)] +use std::cell::RefCell; +use std::path::Path; +use std::rc::Rc; + +use regex::{Captures, Regex}; + +use crate::canonicalize::CanonicalizeContext; +use crate::definitions::Definitions; +use crate::element_util::{add_ids, get_element, trim_element}; +use crate::errors::*; +use crate::prefs::{PreferenceManager, PreferenceManagerBuilder}; +use crate::pretty_print::mml_to_string; +use crate::speech::RulesFor; +use crate::speech::SpeechRules; +use sxd_document::{Package, parser}; +use sxd_document::dom::Element; + +// Used by include!("entities.in") call below. +use phf::phf_map; + +/// Context for MathML to verbalized text conversion. +pub struct MathCat { + speech_rules: SpeechRules, + speech_definitions: Definitions, + canonicalize_context: CanonicalizeContext, +} + +/// Builds a MathCat instance. +/// +/// Builders are not thread-safe, i.e. multiple builders should not be used concurrently. +pub struct MathCatBuilder { + pref_manager_builder: PreferenceManagerBuilder, +} + +impl MathCatBuilder { + pub fn new() -> MathCatBuilder { + MathCatBuilder { pref_manager_builder: PreferenceManagerBuilder::new() } + } + + // Sets the rules directory. + pub fn set_rules_dir(&mut self, path: &Path) { + self.pref_manager_builder.set_rules_dir(path); + } + + /// Set the string-valued preference.
+ pub fn set_pref(&mut self, key: &str, value: &str) { + self.pref_manager_builder.set_string_pref(key, value); + } + + pub fn build(self) -> Result { + let pref_manager: Rc> = self.pref_manager_builder.build()?; + let canonicalize_context = CanonicalizeContext::new_uncached(&pref_manager.borrow()); + let mut speech_rules = SpeechRules::new_stateless(RulesFor::Speech, false, pref_manager); + let mut speech_definitions = Definitions::default(); + speech_rules.read_files_for_stateless(Some(&mut speech_definitions))?; + return Ok(MathCat { + speech_rules: speech_rules, + speech_definitions: speech_definitions, + canonicalize_context: canonicalize_context, + }); + } +} + +impl<'a> MathCat { + /// Returns the spoken text of the given MathML string using the given rules. + pub fn mathml_to_spoken_text(&self, mathml: &str) -> Result { + let (package, _) = create_mathml_instance_with_text(&self.canonicalize_context, &self.speech_definitions, mathml)?; + let mathml = get_element(&package); + let new_package = Package::new(); + let intent = crate::speech::intent_from_rules_and_mathml(&self.speech_rules, &self.speech_definitions, mathml, new_package.as_document())?; + return crate::speech::mathml_node_to_spoken_text(&self.speech_rules, &self.speech_definitions, intent, ""); + } +} + +// wrap up some common functionality between the call from 'main' and AT +pub(crate) fn cleanup_mathml<'a>(context: &CanonicalizeContext, definitions: &Definitions, mathml: Element<'a>) -> Result> { + // TODO: Canonicalization does not seem to actually use rules? + trim_element(mathml, false); + let mathml = context.canonicalize(definitions, mathml)?; + let mathml = add_ids(mathml); + return Ok(mathml); +} + +/// Populates the `package` MathML instance. +/// This returns canonical MathML with 'id's set on any node that doesn't have an id. +/// The ids can be used for sync highlighting if the `Bookmark` API preference is true. 
+pub(crate) fn create_mathml_instance_with_text(canonicalize_context: &CanonicalizeContext, definitions: &Definitions, mathml_str: &str) -> Result<(Package, String)> { + lazy_static! { + // if these are present when resent to MathJaX, MathJaX crashes (https://github.com/mathjax/MathJax/issues/2822) + static ref MATHJAX_V2: Regex = Regex::new(r#"class *= *['"]MJX-.*?['"]"#).unwrap(); + static ref MATHJAX_V3: Regex = Regex::new(r#"class *= *['"]data-mjx-.*?['"]"#).unwrap(); + static ref NAMESPACE_DECL: Regex = Regex::new(r#"xmlns:[[:alpha:]]+"#).unwrap(); // very limited namespace prefix match + static ref PREFIX: Regex = Regex::new(r#"( = include!("entities.in"); + + let mut error_message = "".to_string(); // can't return a result inside the replace_all, so we do this hack of setting the message and then returning the error + // need to deal with character data and convert to something the parser knows + let mathml_str = + HTML_ENTITIES.replace_all(&mathml_str, |cap: &Captures| match HTML_ENTITIES_MAPPING.get(&cap[1]) { + None => { + error_message = format!("No entity named '{}'", &cap[0]); + cap[0].to_string() + } + Some(&ch) => ch.to_string(), + }); + + if !error_message.is_empty() { + bail!(error_message); + } + let mathml_str = MATHJAX_V2.replace_all(&mathml_str, ""); + let mathml_str = MATHJAX_V3.replace_all(&mathml_str, ""); + + // the speech rules use the xpath "name" function and that includes the prefix + // getting rid of the prefix properly probably involves a recursive replacement in the tree + // if the prefix is used, it is almost certainly something like "m" or "mml", so this cheat will work. + let mathml_str = NAMESPACE_DECL.replace(&mathml_str, "xmlns"); // do this before the PREFIX replace! 
+ let mathml_str = PREFIX.replace_all(&mathml_str, "$1"); + + let package = parser::parse(&mathml_str); + if let Err(e) = package { + bail!("Invalid MathML input:\n{}\nError is: {}", &mathml_str, &e.to_string()); + } + + let package = package.unwrap(); + let mathml = get_element(&package); + let mathml = cleanup_mathml(&canonicalize_context, definitions, mathml)?; + let mathml_string = mml_to_string(mathml); + + Ok((package, mathml_string)) +} diff --git a/src/xpath_functions.rs b/src/xpath_functions.rs index 185d2f75..2af19d66 100644 --- a/src/xpath_functions.rs +++ b/src/xpath_functions.rs @@ -20,6 +20,7 @@ use sxd_document::dom::{Element, ChildOfElement}; use sxd_xpath::{Value, Context, context, function::*, nodeset::*}; use crate::definitions::{Definitions, SPEECH_DEFINITIONS, BRAILLE_DEFINITIONS}; +use crate::prefs::PreferenceManager; use regex::Regex; use crate::pretty_print::mml_to_string; use std::cell::{Ref, RefCell}; @@ -420,25 +421,26 @@ impl Function for IsNode { } } -struct ToOrdinal; -impl ToOrdinal { +#[derive(Clone)] +struct ToOrdinal<'a> { + pref_manager: &'a PreferenceManager, + definitions: &'a Definitions, +} +impl ToOrdinal<'_> { // ordinals often have an irregular start (e.g., "half") before becoming regular. // if the number is irregular, return the ordinal form, otherwise return 'None'. - fn compute_irregular_fractional_speech(number: &str, plural: bool) -> Option { - SPEECH_DEFINITIONS.with(|definitions| { - let definitions = definitions.borrow(); - let words = if plural { - definitions.get_vec("NumbersOrdinalFractionalPluralOnes")? - } else { - definitions.get_vec("NumbersOrdinalFractionalOnes")? - }; - let number_as_int: usize = number.parse().unwrap(); // already verified it is only digits - if number_as_int < words.len() { - // use the words associated with this irregular pattern. 
- return Some( words[number_as_int].clone() ); - }; - return None; - }) + fn compute_irregular_fractional_speech(&self, number: &str, plural: bool) -> Option { + let words = if plural { + self.definitions.get_vec("NumbersOrdinalFractionalPluralOnes")? + } else { + self.definitions.get_vec("NumbersOrdinalFractionalOnes")? + }; + let number_as_int: usize = number.parse().unwrap(); // already verified it is only digits + if number_as_int < words.len() { + // use the words associated with this irregular pattern. + return Some( words[number_as_int].clone() ); + }; + return None; } /** @@ -448,136 +450,131 @@ impl ToOrdinal { * plural -- true if answer should be plural * Returns the string representation of that number or an error message */ - fn convert(number: &str, fractional: bool, plural: bool) -> Option { + fn convert(&self, number: &str, fractional: bool, plural: bool) -> Option { lazy_static! { static ref NO_DIGIT: Regex = Regex::new(r"[^\d]").unwrap(); // match anything except a digit } - return SPEECH_DEFINITIONS.with(|definitions| { - let definitions = definitions.borrow(); - let numbers_large = definitions.get_vec("NumbersLarge")?; + let numbers_large = self.definitions.get_vec("NumbersLarge")?; - let pref_manager = crate::prefs::PreferenceManager::get(); - let pref_manager = pref_manager.borrow(); - let block_separators = pref_manager.pref_to_string("BlockSeparators"); - let decimal_separator = pref_manager.pref_to_string("DecimalSeparators"); + let block_separators = self.pref_manager.pref_to_string("BlockSeparators"); + let decimal_separator = self.pref_manager.pref_to_string("DecimalSeparators"); - // check number validity (has digits, not a decimal) - if number.is_empty() || number.contains(&decimal_separator) { - return Some(String::from(number)); - } - // remove any block separators - let number = match clean_number(number, &block_separators) { - None => return Some(String::from(number)), - Some(num) => num, - }; - - // check to see if the number is 
too big or is not an integer or has non-digits - if number.len() > 3*numbers_large.len() { - return Some(number); - } - if NO_DIGIT.is_match(&number) { - // this shouldn't have been part of an mn, so likely an error. Log a warning - // FIX: log a warning that a non-number was passed to convert() - return Some(number); - } + // check number validity (has digits, not a decimal) + if number.is_empty() || number.contains(&decimal_separator) { + return Some(String::from(number)); + } + // remove any block separators + let number = match clean_number(number, &block_separators) { + None => return Some(String::from(number)), + Some(num) => num, + }; - // first deal with the abnormalities of fractional ordinals (one half, etc). That simplifies what remains - if fractional { - if let Some(string) = ToOrdinal::compute_irregular_fractional_speech(&number, plural) { - return Some(string); - } - } + // check to see if the number is too big or is not an integer or has non-digits + if number.len() > 3*numbers_large.len() { + return Some(number); + } + if NO_DIGIT.is_match(&number) { + // this shouldn't have been part of an mn, so likely an error. Log a warning + // FIX: log a warning that a non-number was passed to convert() + return Some(number); + } - // at this point, we only need to worry about singular/plural distinction + // first deal with the abnormalities of fractional ordinals (one half, etc). That simplifies what remains + if fractional { + if let Some(string) = self.compute_irregular_fractional_speech(&number, plural) { + return Some(string); + } + } - // break into groups of three digits and add 10^3 word (thousands, millions, ...) after each chunk - // FIX: add a pause between groups of three -- need to use TTS-specific pause + // at this point, we only need to worry about singular/plural distinction - // handle special case of trailing zeros - // num_thousands_at_end represents the amount to shift NumbersLarge... 
(e.g., millions->thousands) - let num_thousands_at_end = match number.rfind(|ch| ch > '0') { // last non-0 on right - Some(n) => (number.len() - 1 - n) / 3 , - None => 0 - }; - let (number,_) = number.split_at(number.len() - 3 * num_thousands_at_end); // drop the 0s + // break into groups of three digits and add 10^3 word (thousands, millions, ...) after each chunk + // FIX: add a pause between groups of three -- need to use TTS-specific pause - // everything is simplified if we add zeros at the start so that block size is a factor of 3 - let number = match number.len() % 3 { - 0 => "".to_string() + number, - 1 => "00".to_string() + number, - _ => "0".to_string() + number, // can only be "2" -- compiler doesn't know there aren't other options - }; + // handle special case of trailing zeros + // num_thousands_at_end represents the amount to shift NumbersLarge... (e.g., millions->thousands) + let num_thousands_at_end = match number.rfind(|ch| ch > '0') { // last non-0 on right + Some(n) => (number.len() - 1 - n) / 3 , + None => 0 + }; + let (number,_) = number.split_at(number.len() - 3 * num_thousands_at_end); // drop the 0s - // At this point we have at least three "digits", and length is a multiple of 3 - // We have already verified that there are only ASCII digits, so we can subtract '0' to get an index - const ASCII_0: usize = 48; - let digits = number.as_bytes() - .iter() - .map(|&byte| byte as usize - ASCII_0) - .collect::>(); - - let mut answer = String::with_capacity(255); // reasonable max most of the time - let large_words = numbers_large; - if digits.len() > 3 { - // speak this first groups as cardinal numbers - let words = [ - definitions.get_vec("NumbersHundreds")?, - definitions.get_vec("NumbersTens")?, - definitions.get_vec("NumbersOnes")?, - ]; - answer = digits[0..digits.len()-3] - .chunks(3) - .enumerate() - .map(|(i, chunk)| { - if chunk[0] != 0 || chunk[1] != 0 || chunk[2] != 0 { - Some(ToOrdinal::hundreds_to_words(chunk, &words)? 
+ " " + - &large_words[num_thousands_at_end + digits.len()/3 - 1 - i] + " ") - } else { - Some("".to_string()) - } - }) - .collect::>>()? - .join(""); // can't use " " because 1000567 would get extra space in the middle - if num_thousands_at_end > 0 { - // add on "billionths", etc and we are done - let large_words = if plural { - definitions.get_vec("NumbersOrdinalPluralLarge") - } else { - definitions.get_vec("NumbersOrdinalLarge") - }; - return Some(answer + &large_words?[num_thousands_at_end]); - } - }; + // everything is simplified if we add zeros at the start so that block size is a factor of 3 + let number = match number.len() % 3 { + 0 => "".to_string() + number, + 1 => "00".to_string() + number, + _ => "0".to_string() + number, // can only be "2" -- compiler doesn't know there aren't other options + }; - // all that is left is to speak the hundreds part, possibly followed by "thousands", "billions", etc - let words = match (num_thousands_at_end > 0, plural) { - (true, _) => [ - definitions.get_vec("NumbersHundreds")?, - definitions.get_vec("NumbersTens")?, - definitions.get_vec("NumbersOnes")?, - ], - (false, true) => [ - definitions.get_vec("NumbersOrdinalPluralHundreds")?, - definitions.get_vec("NumbersOrdinalPluralTens")?, - definitions.get_vec("NumbersOrdinalPluralOnes")?, - ], - (false, false) => [ - definitions.get_vec("NumbersOrdinalHundreds")?, - definitions.get_vec("NumbersOrdinalTens")?, - definitions.get_vec("NumbersOrdinalOnes")?, - ], - }; - answer += &ToOrdinal::hundreds_to_words(&digits[digits.len()-3..], &words)?; + // At this point we have at least three "digits", and length is a multiple of 3 + // We have already verified that there are only ASCII digits, so we can subtract '0' to get an index + const ASCII_0: usize = 48; + let digits = number.as_bytes() + .iter() + .map(|&byte| byte as usize - ASCII_0) + .collect::>(); + + let mut answer = String::with_capacity(255); // reasonable max most of the time + let large_words = numbers_large; + 
if digits.len() > 3 { + // speak this first groups as cardinal numbers + let words = [ + self.definitions.get_vec("NumbersHundreds")?, + self.definitions.get_vec("NumbersTens")?, + self.definitions.get_vec("NumbersOnes")?, + ]; + answer = digits[0..digits.len()-3] + .chunks(3) + .enumerate() + .map(|(i, chunk)| { + if chunk[0] != 0 || chunk[1] != 0 || chunk[2] != 0 { + Some(self.hundreds_to_words(chunk, &words)? + " " + + &large_words[num_thousands_at_end + digits.len()/3 - 1 - i] + " ") + } else { + Some("".to_string()) + } + }) + .collect::>>()? + .join(""); // can't use " " because 1000567 would get extra space in the middle if num_thousands_at_end > 0 { + // add on "billionths", etc and we are done let large_words = if plural { - definitions.get_vec("NumbersOrdinalPluralLarge")? + self.definitions.get_vec("NumbersOrdinalPluralLarge") } else { - definitions.get_vec("NumbersOrdinalLarge")? + self.definitions.get_vec("NumbersOrdinalLarge") }; - answer = answer + " " + &large_words[num_thousands_at_end]; + return Some(answer + &large_words?[num_thousands_at_end]); } - return Some(answer); - }); + }; + + // all that is left is to speak the hundreds part, possibly followed by "thousands", "billions", etc + let words = match (num_thousands_at_end > 0, plural) { + (true, _) => [ + self.definitions.get_vec("NumbersHundreds")?, + self.definitions.get_vec("NumbersTens")?, + self.definitions.get_vec("NumbersOnes")?, + ], + (false, true) => [ + self.definitions.get_vec("NumbersOrdinalPluralHundreds")?, + self.definitions.get_vec("NumbersOrdinalPluralTens")?, + self.definitions.get_vec("NumbersOrdinalPluralOnes")?, + ], + (false, false) => [ + self.definitions.get_vec("NumbersOrdinalHundreds")?, + self.definitions.get_vec("NumbersOrdinalTens")?, + self.definitions.get_vec("NumbersOrdinalOnes")?, + ], + }; + answer += &self.hundreds_to_words(&digits[digits.len()-3..], &words)?; + if num_thousands_at_end > 0 { + let large_words = if plural { + 
self.definitions.get_vec("NumbersOrdinalPluralLarge")? + } else { + self.definitions.get_vec("NumbersOrdinalLarge")? + }; + answer = answer + " " + &large_words[num_thousands_at_end]; + } + return Some(answer); /// Remove block separators and convert alphanumeric digits to ascii digits fn clean_number(number: &str, block_separators: &str) -> Option { @@ -605,34 +602,31 @@ impl ToOrdinal { } - fn hundreds_to_words(number: &[usize], words: &[Ref>; 3]) -> Option { + fn hundreds_to_words(&self, number: &[usize], words: &[Ref>; 3]) -> Option { assert!( number.len() == 3 ); - return SPEECH_DEFINITIONS.with(|definitions| { - let definitions = definitions.borrow(); - if number[0] != 0 && number[1] == 0 && number[2] == 0 { - return Some(words[0][number[0]].clone()); - } + if number[0] != 0 && number[1] == 0 && number[2] == 0 { + return Some(words[0][number[0]].clone()); + } - let mut hundreds = definitions.get_vec("NumbersHundreds")?[number[0]].clone(); - if !hundreds.is_empty() { - hundreds += " "; - } + let mut hundreds = self.definitions.get_vec("NumbersHundreds")?[number[0]].clone(); + if !hundreds.is_empty() { + hundreds += " "; + } - if number[1] != 0 && number[2] == 0 { - return Some(hundreds + &words[1][number[1]]); - } + if number[1] != 0 && number[2] == 0 { + return Some(hundreds + &words[1][number[1]]); + } - if 10*number[1] < words[2].len() { - // usurp regular ordering to handle something like '14' - return Some(hundreds + &words[2][10*number[1] + number[2]]); - } else { - return Some(hundreds + &definitions.get_vec("NumbersTens")?[number[1]] + " " + &words[2][number[2]]); - } - }); + if 10*number[1] < words[2].len() { + // usurp regular ordering to handle something like '14' + return Some(hundreds + &words[2][10*number[1] + number[2]]); + } else { + return Some(hundreds + &self.definitions.get_vec("NumbersTens")?[number[1]] + " " + &words[2][number[2]]); + } } } -impl Function for ToOrdinal { +impl Function for ToOrdinal<'_> { // convert a node to an ordinal 
number fn evaluate<'d>(&self, _context: &context::Evaluation<'_, 'd>, @@ -652,12 +646,12 @@ impl Function for ToOrdinal { let node = validate_one_node(args.pop_nodeset()?, "ToOrdinal")?; return match node { Node::Text(t) => Ok( Value::String( - match ToOrdinal::convert(t.text(), fractional, plural) { + match self.convert(t.text(), fractional, plural) { None => t.text().to_string(), Some(ord) => ord, } ) ), Node::Element(e) => Ok( Value::String( - match ToOrdinal::convert(&get_text_from_element(e), fractional, plural) { + match self.convert(&get_text_from_element(e), fractional, plural) { None => get_text_from_element(e).to_string(), Some(ord) => ord, } ) ), @@ -667,9 +661,11 @@ impl Function for ToOrdinal { } -struct ToCommonFraction; +struct ToCommonFraction<'a> { + to_ordinal: ToOrdinal<'a>, +} -impl Function for ToCommonFraction { +impl Function for ToCommonFraction<'_> { // convert a node to a common fraction (if the numerator and denominator are within given limits) fn evaluate<'d>(&self, _context: &context::Evaluation<'_, 'd>, @@ -693,7 +689,7 @@ impl Function for ToCommonFraction { let denom = children[1].element().unwrap(); let denom = get_text_from_element( denom ); let mut answer = num.clone() + " "; - answer += &match ToOrdinal::convert(&denom, true, num!="1") { + answer += &match self.to_ordinal.convert(&denom, true, num!="1") { None => denom, Some(ord) => ord, }; @@ -1417,13 +1413,25 @@ impl Function for ReplaceAll { } /// Add all the functions defined in this module to `context`. 
-pub fn add_builtin_functions(context: &mut Context) { +pub fn add_builtin_functions<'a: 'b, 'b>(pref_manager: &'a PreferenceManager, definitions: &'a Definitions, context: &'b mut Context) { context.set_function("NestingChars", crate::braille::NemethNestingChars); context.set_function("BrailleChars", crate::braille::BrailleChars); context.set_function("NeedsToBeGrouped", crate::braille::NeedsToBeGrouped); context.set_function("IsNode", IsNode); - context.set_function("ToOrdinal", ToOrdinal); - context.set_function("ToCommonFraction", ToCommonFraction); + + // sxd-xpath `Function` type has a `+ 'static` bound but our + // functions depend on preferences and definitions. + // https://github.com/shepmaster/sxd-xpath/issues/149 + // + // Safe because preferences and definitions are either global + // (stateful API) or outlive the context (not exposed in stateless API). + unsafe fn extend_lifetime<'z>(r: ToOrdinal<'z>) -> ToOrdinal<'static> { + std::mem::transmute::, ToOrdinal<'static>>(r) + } + let to_ordinal = unsafe { extend_lifetime(ToOrdinal { pref_manager, definitions }) }; + context.set_function("ToCommonFraction", ToCommonFraction { to_ordinal: to_ordinal.clone() }); + context.set_function("ToOrdinal", to_ordinal); + context.set_function("IsBracketed", IsBracketed); context.set_function("IsInDefinition", IsInDefinition); context.set_function("DefinitionValue", DefinitionValue); @@ -1449,12 +1457,16 @@ pub fn add_builtin_functions(context: &mut Context) { mod tests { use super::*; use sxd_document::parser; - use crate::interface::{trim_element, get_element}; + use crate::element_util::{trim_element, get_element}; + use crate::prefs::PreferenceManager; fn init_word_list() { crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap(); - let result = crate::definitions::read_definitions_file(true); + let result = + crate::definitions::SPEECH_DEFINITIONS.with_borrow_mut(|defs| + crate::definitions::read_definitions_file( + 
PreferenceManager::get().borrow().get_definitions_file(true), defs)); if let Err(e) = result { panic!("unable to read 'Rules/Languages/en/definitions.yaml\n{}", e.to_string()); } @@ -1463,94 +1475,114 @@ mod tests { #[test] fn ordinal_one_digit() { init_word_list(); - assert_eq!("zeroth", ToOrdinal::convert("0", false, false).unwrap()); - assert_eq!("second", ToOrdinal::convert("2", false, false).unwrap()); - assert_eq!("ninth", ToOrdinal::convert("9", false, false).unwrap()); - - assert_eq!("zeroth", ToOrdinal::convert("0", false, true).unwrap()); - assert_eq!("seconds", ToOrdinal::convert("2", false, true).unwrap()); - assert_eq!("ninths", ToOrdinal::convert("9", false, true).unwrap()); - - assert_eq!("first", ToOrdinal::convert("1", true, false).unwrap()); - assert_eq!("half", ToOrdinal::convert("2", true, false).unwrap()); - assert_eq!("half", ToOrdinal::convert("02", true, false).unwrap()); - assert_eq!("ninth", ToOrdinal::convert("9", true, false).unwrap()); - - assert_eq!("halves", ToOrdinal::convert("2", true, true).unwrap()); - assert_eq!("halves", ToOrdinal::convert("002", true, true).unwrap()); - assert_eq!("ninths", ToOrdinal::convert("9", true, true).unwrap()); + SPEECH_DEFINITIONS.with_borrow(|definitions| { + let pref_manager = PreferenceManager::get(); + let to_ordinal = ToOrdinal { pref_manager: &pref_manager.borrow(), definitions: definitions }; + + assert_eq!("zeroth", to_ordinal.convert("0", false, false).unwrap()); + assert_eq!("second", to_ordinal.convert("2", false, false).unwrap()); + assert_eq!("ninth", to_ordinal.convert("9", false, false).unwrap()); + + assert_eq!("zeroth", to_ordinal.convert("0", false, true).unwrap()); + assert_eq!("seconds", to_ordinal.convert("2", false, true).unwrap()); + assert_eq!("ninths", to_ordinal.convert("9", false, true).unwrap()); + + assert_eq!("first", to_ordinal.convert("1", true, false).unwrap()); + assert_eq!("half", to_ordinal.convert("2", true, false).unwrap()); + assert_eq!("half", 
to_ordinal.convert("02", true, false).unwrap()); + assert_eq!("ninth", to_ordinal.convert("9", true, false).unwrap()); + + assert_eq!("halves", to_ordinal.convert("2", true, true).unwrap()); + assert_eq!("halves", to_ordinal.convert("002", true, true).unwrap()); + assert_eq!("ninths", to_ordinal.convert("9", true, true).unwrap()); + }); } #[test] fn ordinal_two_digit() { init_word_list(); - assert_eq!("tenth", ToOrdinal::convert("10", false, false).unwrap()); - assert_eq!("seventeenth", ToOrdinal::convert("17", false, false).unwrap()); - assert_eq!("thirty second", ToOrdinal::convert("32", false, false).unwrap()); - assert_eq!("fortieth", ToOrdinal::convert("40", false, false).unwrap()); - - assert_eq!("tenths", ToOrdinal::convert("10", false, true).unwrap()); - assert_eq!("sixteenths", ToOrdinal::convert("16", false, true).unwrap()); - assert_eq!("eighty eighths", ToOrdinal::convert("88", false, true).unwrap()); - assert_eq!("fiftieths", ToOrdinal::convert("50", false, true).unwrap()); - - assert_eq!("eleventh", ToOrdinal::convert("11", true, false).unwrap()); - assert_eq!("forty fourth", ToOrdinal::convert("44", true, false).unwrap()); - assert_eq!("ninth", ToOrdinal::convert("9", true, false).unwrap()); - assert_eq!("ninth", ToOrdinal::convert("00000009", true, false).unwrap()); - assert_eq!("sixtieth", ToOrdinal::convert("60", true, false).unwrap()); - - assert_eq!("tenths", ToOrdinal::convert("10", true, true).unwrap()); - assert_eq!("tenths", ToOrdinal::convert("0010", true, true).unwrap()); - assert_eq!("elevenths", ToOrdinal::convert("11", true, true).unwrap()); - assert_eq!("nineteenths", ToOrdinal::convert("19", true, true).unwrap()); - assert_eq!("twentieths", ToOrdinal::convert("20", true, true).unwrap()); - assert_eq!("nineteenths", ToOrdinal::convert("𝟏𝟗", true, true).unwrap()); + SPEECH_DEFINITIONS.with_borrow(|definitions| { + let pref_manager = PreferenceManager::get(); + let to_ordinal = ToOrdinal { pref_manager: &pref_manager.borrow(), 
definitions: definitions }; + + assert_eq!("tenth", to_ordinal.convert("10", false, false).unwrap()); + assert_eq!("seventeenth", to_ordinal.convert("17", false, false).unwrap()); + assert_eq!("thirty second", to_ordinal.convert("32", false, false).unwrap()); + assert_eq!("fortieth", to_ordinal.convert("40", false, false).unwrap()); + + assert_eq!("tenths", to_ordinal.convert("10", false, true).unwrap()); + assert_eq!("sixteenths", to_ordinal.convert("16", false, true).unwrap()); + assert_eq!("eighty eighths", to_ordinal.convert("88", false, true).unwrap()); + assert_eq!("fiftieths", to_ordinal.convert("50", false, true).unwrap()); + + assert_eq!("eleventh", to_ordinal.convert("11", true, false).unwrap()); + assert_eq!("forty fourth", to_ordinal.convert("44", true, false).unwrap()); + assert_eq!("ninth", to_ordinal.convert("9", true, false).unwrap()); + assert_eq!("ninth", to_ordinal.convert("00000009", true, false).unwrap()); + assert_eq!("sixtieth", to_ordinal.convert("60", true, false).unwrap()); + + assert_eq!("tenths", to_ordinal.convert("10", true, true).unwrap()); + assert_eq!("tenths", to_ordinal.convert("0010", true, true).unwrap()); + assert_eq!("elevenths", to_ordinal.convert("11", true, true).unwrap()); + assert_eq!("nineteenths", to_ordinal.convert("19", true, true).unwrap()); + assert_eq!("twentieths", to_ordinal.convert("20", true, true).unwrap()); + assert_eq!("nineteenths", to_ordinal.convert("𝟏𝟗", true, true).unwrap()); + }); } #[test] fn ordinal_three_digit() { init_word_list(); - assert_eq!("one hundred first", ToOrdinal::convert("101", false, false).unwrap()); - assert_eq!("two hundred tenth", ToOrdinal::convert("210", false, false).unwrap()); - assert_eq!("four hundred thirty second", ToOrdinal::convert("432", false, false).unwrap()); - assert_eq!("four hundred second", ToOrdinal::convert("402", false, false).unwrap()); - - assert_eq!("one hundred first", ToOrdinal::convert("101", true, false).unwrap()); - assert_eq!("two hundred second", 
ToOrdinal::convert("202", true, false).unwrap()); - assert_eq!("four hundred thirty second", ToOrdinal::convert("432", true, false).unwrap()); - assert_eq!("five hundred third", ToOrdinal::convert("503", true, false).unwrap()); - - assert_eq!("three hundred elevenths", ToOrdinal::convert("311", false, true).unwrap()); - assert_eq!("four hundred ninety ninths", ToOrdinal::convert("499", false, true).unwrap()); - assert_eq!("nine hundred ninetieths", ToOrdinal::convert("990", false, true).unwrap()); - assert_eq!("six hundred seconds", ToOrdinal::convert("602", false, true).unwrap()); - - assert_eq!("seven hundredths", ToOrdinal::convert("700", true, true).unwrap()); - assert_eq!("one hundredths", ToOrdinal::convert("100", true, true).unwrap()); - assert_eq!("eight hundred seventeenths", ToOrdinal::convert("817", true, true).unwrap()); + SPEECH_DEFINITIONS.with_borrow(|definitions| { + let pref_manager = PreferenceManager::get(); + let to_ordinal = ToOrdinal { pref_manager: &pref_manager.borrow(), definitions: definitions }; + + assert_eq!("one hundred first", to_ordinal.convert("101", false, false).unwrap()); + assert_eq!("two hundred tenth", to_ordinal.convert("210", false, false).unwrap()); + assert_eq!("four hundred thirty second", to_ordinal.convert("432", false, false).unwrap()); + assert_eq!("four hundred second", to_ordinal.convert("402", false, false).unwrap()); + + assert_eq!("one hundred first", to_ordinal.convert("101", true, false).unwrap()); + assert_eq!("two hundred second", to_ordinal.convert("202", true, false).unwrap()); + assert_eq!("four hundred thirty second", to_ordinal.convert("432", true, false).unwrap()); + assert_eq!("five hundred third", to_ordinal.convert("503", true, false).unwrap()); + + assert_eq!("three hundred elevenths", to_ordinal.convert("311", false, true).unwrap()); + assert_eq!("four hundred ninety ninths", to_ordinal.convert("499", false, true).unwrap()); + assert_eq!("nine hundred ninetieths", to_ordinal.convert("990", false, 
true).unwrap()); + assert_eq!("six hundred seconds", to_ordinal.convert("602", false, true).unwrap()); + + assert_eq!("seven hundredths", to_ordinal.convert("700", true, true).unwrap()); + assert_eq!("one hundredths", to_ordinal.convert("100", true, true).unwrap()); + assert_eq!("eight hundred seventeenths", to_ordinal.convert("817", true, true).unwrap()); + }); } #[test] fn ordinal_large() { init_word_list(); - assert_eq!("one thousandth", ToOrdinal::convert("1000", false, false).unwrap()); - assert_eq!("two thousand one hundredth", ToOrdinal::convert("2100", false, false).unwrap()); - assert_eq!("thirty thousandth", ToOrdinal::convert("30000", false, false).unwrap()); - assert_eq!("four hundred thousandth", ToOrdinal::convert("400000", false, false).unwrap()); - - assert_eq!("four hundred thousandth", ToOrdinal::convert("400000", true, false).unwrap()); - assert_eq!("five hundred thousand second", ToOrdinal::convert("500002", true, false).unwrap()); - assert_eq!("six millionth", ToOrdinal::convert("6000000", true, false).unwrap()); - assert_eq!("sixty millionth", ToOrdinal::convert("60000000", true, false).unwrap()); - - assert_eq!("seven billionths", ToOrdinal::convert("7000000000", false, true).unwrap()); - assert_eq!("eight trillionths", ToOrdinal::convert("8000000000000", false, true).unwrap()); - assert_eq!("nine quadrillionths", ToOrdinal::convert("9000000000000000", false, true).unwrap()); - assert_eq!("one quintillionth", ToOrdinal::convert("1000000000000000000", false, false).unwrap()); - - assert_eq!("nine billion eight hundred seventy six million five hundred forty three thousand two hundred tenths", ToOrdinal::convert("9876543210", true, true).unwrap()); - assert_eq!("nine billion five hundred forty three thousand two hundred tenths", ToOrdinal::convert("9000543210", true, true).unwrap()); - assert_eq!("zeroth", ToOrdinal::convert("00000", false, false).unwrap()); + SPEECH_DEFINITIONS.with_borrow(|definitions| { + let pref_manager = 
PreferenceManager::get(); + let to_ordinal = ToOrdinal { pref_manager: &pref_manager.borrow(), definitions: definitions }; + + assert_eq!("one thousandth", to_ordinal.convert("1000", false, false).unwrap()); + assert_eq!("two thousand one hundredth", to_ordinal.convert("2100", false, false).unwrap()); + assert_eq!("thirty thousandth", to_ordinal.convert("30000", false, false).unwrap()); + assert_eq!("four hundred thousandth", to_ordinal.convert("400000", false, false).unwrap()); + + assert_eq!("four hundred thousandth", to_ordinal.convert("400000", true, false).unwrap()); + assert_eq!("five hundred thousand second", to_ordinal.convert("500002", true, false).unwrap()); + assert_eq!("six millionth", to_ordinal.convert("6000000", true, false).unwrap()); + assert_eq!("sixty millionth", to_ordinal.convert("60000000", true, false).unwrap()); + + assert_eq!("seven billionths", to_ordinal.convert("7000000000", false, true).unwrap()); + assert_eq!("eight trillionths", to_ordinal.convert("8000000000000", false, true).unwrap()); + assert_eq!("nine quadrillionths", to_ordinal.convert("9000000000000000", false, true).unwrap()); + assert_eq!("one quintillionth", to_ordinal.convert("1000000000000000000", false, false).unwrap()); + + assert_eq!("nine billion eight hundred seventy six million five hundred forty three thousand two hundred tenths", to_ordinal.convert("9876543210", true, true).unwrap()); + assert_eq!("nine billion five hundred forty three thousand two hundred tenths", to_ordinal.convert("9000543210", true, true).unwrap()); + assert_eq!("zeroth", to_ordinal.convert("00000", false, false).unwrap()); + }); } @@ -1639,4 +1671,4 @@ mod tests { let mn = as_element(as_element(fraction.children()[1]).children()[0]); assert_eq!(EdgeNode::edge_node(mn, true, "2D"), None); } -} \ No newline at end of file +} diff --git a/tests/common/mod.rs b/tests/common/mod.rs index d35fc0ab..29cd6a12 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -5,6 +5,7 @@ use regex::Regex; 
extern crate lazy_static; use lazy_static::lazy_static; pub use libmathcat::interface::*; +use libmathcat::element_util::*; #[allow(dead_code)] pub fn init_logger() {