diff --git a/Src/xWorks/ConfiguredLcmGenerator.cs b/Src/xWorks/ConfiguredLcmGenerator.cs
index 7314a4e5b0..1c02b418c4 100644
--- a/Src/xWorks/ConfiguredLcmGenerator.cs
+++ b/Src/xWorks/ConfiguredLcmGenerator.cs
@@ -429,9 +429,14 @@ internal static IFragment GenerateContentForEntry(ICmObject entry, ConfigurableD
 				settings.ContentGenerator.EndEntry(xw);
 				xw.Flush();
 
-				// Do not normalize the string if exporting to word doc--it is not needed and will cause loss of document styles
-				if (bldr is LcmWordGenerator.DocFragment)
-					return bldr;
+				// All content should be in NFC (LT-18177). For Word export we normalize the text inside the
+				// OpenXml runs in place: normalizing bldr.ToString() and rebuilding a fragment from the string
+				// (as done below for XHTML) would discard the document's run and paragraph styles.
+				if (bldr is LcmWordGenerator.DocFragment wordFragment)
+				{
+					wordFragment.NormalizeText(FwNormalizationMode.knmNFC);
+					return wordFragment;
+				}
 
 				return settings.ContentGenerator.CreateFragment(CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFC).Normalize(bldr.ToString())); // All content should be in NFC (LT-18177)
 			}
diff --git a/Src/xWorks/LcmWordGenerator.cs b/Src/xWorks/LcmWordGenerator.cs
index e56f864370..7409637411 100644
--- a/Src/xWorks/LcmWordGenerator.cs
+++ b/Src/xWorks/LcmWordGenerator.cs
@@ -10,6 +10,7 @@
 using SIL.FieldWorks.Common.FwUtils;
 using SIL.FieldWorks.Common.Widgets;
 using SIL.LCModel;
+using SIL.LCModel.Core.Text;
 using SIL.LCModel.Core.WritingSystems;
 using SIL.LCModel.DomainServices;
 using SIL.LCModel.Utils;
@@ -424,6 +425,11 @@ internal static DocFragment GenerateLetterHeaderDocFragment(string str, string s
 				// Only create paragraph, run, and text objects if string is nonempty
 				if (!string.IsNullOrEmpty(str))
 				{
+					// All exported content should be NFC (LT-18177). The header letter derives from the NFD sort
+					// word (see ConfiguredLcmGenerator.GetSortWordForLetterHead) and its upper-cased form can be
+					// decomposed, so normalize before writing it into the run.
+					str = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFC).Normalize(str);
+
 					// Don't add this paragraph before the first letter header. It results in an extra blank line.
 					if (!firstHeader)
 					{
@@ -540,6 +546,31 @@ public int Length()
 				return str.Length;
 			}
 
+			/// <summary>
+			/// Normalizes the text in every run of the fragment to the given Unicode normalization form, in place.
+			/// FieldWorks keeps strings as NFD in memory, but all exported content should be NFC (LT-18177), to
+			/// match the XHTML/Webonary export. We normalize the text within the OpenXml text nodes rather than
+			/// round-tripping the fragment through a string so the document structure and styles are preserved.
+			/// </summary>
+			/// <remarks>
+			/// NOTE(review): each text node is normalized on its own, so a base character and a combining mark
+			/// that land in different text nodes are not composed; confirm runs never split such a sequence.
+			/// </remarks>
+			public void NormalizeText(FwNormalizationMode mode)
+			{
+				if (IsNullOrEmpty())
+					return;
+
+				var normalizer = CustomIcu.GetIcuNormalizer(mode);
+				foreach (WP.Text txt in DocBody.Descendants<WP.Text>())
+				{
+					// Setting Text only changes the run's text value; the xml:space attribute and all
+					// run/paragraph properties (i.e. the styles) are left untouched.
+					if (!string.IsNullOrEmpty(txt.Text))
+						txt.Text = normalizer.Normalize(txt.Text);
+				}
+			}
+
 			/// 
 			/// Appends one doc fragment to another.
 			/// Use this if styles have already been applied.
diff --git a/Src/xWorks/xWorksTests/LcmWordGeneratorTests.cs b/Src/xWorks/xWorksTests/LcmWordGeneratorTests.cs
index b46922f29e..2e19af328a 100644
--- a/Src/xWorks/xWorksTests/LcmWordGeneratorTests.cs
+++ b/Src/xWorks/xWorksTests/LcmWordGeneratorTests.cs
@@ -18,6 +18,7 @@
 using SIL.LCModel;
 using SIL.LCModel.Core.KernelInterfaces;
 using SIL.LCModel.Core.Text;
+using SIL.LCModel.Core.WritingSystems;
 using SIL.LCModel.DomainServices;
 using SIL.WritingSystems;
 using SIL.TestUtilities;
@@ -599,6 +600,65 @@ public void GenerateWordDocForEntry_LineBreaksInBeforeContentWork()
 				WordNamespaceManager);
 		}
 
+		[Test]
+		public void GenerateWordDocForEntry_GeneratesNFC()
+		{
+			// FieldWorks stores strings as NFD (decomposed) in memory. All exported content should be NFC
+			// (LT-18177), matching the XHTML/Webonary export. This guards the Word export against emitting NFD.
+			var headwordNode = new ConfigurableDictionaryNode
+			{
+				FieldDescription = "MLHeadWord",
+				CSSClassNameOverride = "headword",
+				DictionaryNodeOptions = ConfiguredXHTMLGeneratorTests.GetWsOptionsForLanguages(new[] { "ko" }),
+				Style = "Dictionary-Headword"
+			};
+			var mainEntryNode = new ConfigurableDictionaryNode
+			{
+				FieldDescription = "LexEntry",
+				Children = new List<ConfigurableDictionaryNode> { headwordNode },
+				Style = MainEntryParagraphStyleName
+			};
+			CssGeneratorTests.PopulateFieldsForTesting(mainEntryNode);
+
+			Cache.LangProject.AddToCurrentVernacularWritingSystems(Cache.WritingSystemFactory.get_Engine("ko") as CoreWritingSystemDefinition);
+			var wsKo = Cache.WritingSystemFactory.GetWsFromStr("ko");
+			var entry = Cache.ServiceLocator.GetInstance<ILexEntryFactory>().Create();
+			// Decompose explicitly so the input is NFD regardless of how this source file stores the literal.
+			var nfdHeadword = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD).Normalize("자ㄱㄴ시"); // Korean
+			entry.CitationForm.set_String(wsKo, TsStringUtils.MakeString(nfdHeadword, wsKo));
+			var storedHeadword = entry.CitationForm.get_String(wsKo);
+			Assert.That(storedHeadword.get_IsNormalizedForm(FwNormalizationMode.knmNFD), "Should be NFDecomposed in memory");
+			Assert.That(storedHeadword.Text.Length, Is.EqualTo(6), "NFD headword should have 6 codepoints");
+
+			//SUT
+			var result = ConfiguredLcmGenerator.GenerateContentForEntry(entry, mainEntryNode, null, DefaultSettings, 0) as DocFragment;
+			Assert.That(result, Is.Not.Null, "Results should have been generated");
+			var tsResult = TsStringUtils.MakeString(result.ToString(), wsKo);
+			Assert.That(TsStringUtils.IsNullOrEmpty(tsResult), Is.False, "Results should have been generated");
+			Assert.That(tsResult.get_IsNormalizedForm(FwNormalizationMode.knmNFC), "Resulting Word content should be NFComposed (NFC)");
+		}
+
+		[Test]
+		public void GenerateLetterHeaderDocFragment_GeneratesNFC()
+		{
+			// The letter header's letter derives from the NFD sort word (see GetSortWordForLetterHead) and its
+			// upper-cased form can be decomposed. All exported content should be NFC (LT-18177).
+			var wsString = Cache.WritingSystemFactory.GetStrFromWs(Cache.DefaultVernWs);
+			// Vietnamese "Ầ ầ" fully decomposed (A/a + combining circumflex + combining grave).
+			var nfdHeader = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD).Normalize("Ầ ầ");
+			Assert.That(TsStringUtils.MakeString(nfdHeader, Cache.DefaultVernWs).get_IsNormalizedForm(FwNormalizationMode.knmNFD),
+				"Sanity check: the header input should be NFDecomposed");
+
+			//SUT
+			var headerFrag = DocFragment.GenerateLetterHeaderDocFragment(nfdHeader,
+				WordStylesGenerator.LetterHeadingDisplayName, false, wsString);
+
+			var tsResult = TsStringUtils.MakeString(headerFrag.ToString(), Cache.DefaultVernWs);
+			Assert.That(TsStringUtils.IsNullOrEmpty(tsResult), Is.False, "Header should have been generated");
+			Assert.That(tsResult.get_IsNormalizedForm(FwNormalizationMode.knmNFC),
+				"Letter header content should be NFComposed (NFC)");
+		}
+
 		[Test]
 		public void GenerateUniqueStyleName()
 		{