diff --git a/Src/xWorks/ConfiguredLcmGenerator.cs b/Src/xWorks/ConfiguredLcmGenerator.cs
index 7314a4e5b0..1c02b418c4 100644
--- a/Src/xWorks/ConfiguredLcmGenerator.cs
+++ b/Src/xWorks/ConfiguredLcmGenerator.cs
@@ -429,9 +429,14 @@ internal static IFragment GenerateContentForEntry(ICmObject entry, ConfigurableD
settings.ContentGenerator.EndEntry(xw);
xw.Flush();
- // Do not normalize the string if exporting to word doc--it is not needed and will cause loss of document styles
- if (bldr is LcmWordGenerator.DocFragment)
- return bldr;
+ // All content should be in NFC (LT-18177). For Word export we normalize the text inside the
+ // OpenXml runs in place: normalizing bldr.ToString() and rebuilding a fragment from the string
+ // (as done below for XHTML) would discard the document's run and paragraph styles.
+ if (bldr is LcmWordGenerator.DocFragment wordFragment)
+ {
+ wordFragment.NormalizeText(FwNormalizationMode.knmNFC);
+ return wordFragment;
+ }
return settings.ContentGenerator.CreateFragment(CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFC).Normalize(bldr.ToString())); // All content should be in NFC (LT-18177)
}
diff --git a/Src/xWorks/LcmWordGenerator.cs b/Src/xWorks/LcmWordGenerator.cs
index e56f864370..7409637411 100644
--- a/Src/xWorks/LcmWordGenerator.cs
+++ b/Src/xWorks/LcmWordGenerator.cs
@@ -10,6 +10,7 @@
using SIL.FieldWorks.Common.FwUtils;
using SIL.FieldWorks.Common.Widgets;
using SIL.LCModel;
+using SIL.LCModel.Core.Text;
using SIL.LCModel.Core.WritingSystems;
using SIL.LCModel.DomainServices;
using SIL.LCModel.Utils;
@@ -424,6 +425,11 @@ internal static DocFragment GenerateLetterHeaderDocFragment(string str, string s
// Only create paragraph, run, and text objects if string is nonempty
if (!string.IsNullOrEmpty(str))
{
+ // All exported content should be NFC (LT-18177). The header letter derives from the NFD sort
+ // word (see ConfiguredLcmGenerator.GetSortWordForLetterHead) and its upper-cased form can be
+ // decomposed, so normalize before writing it into the run.
+ str = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFC).Normalize(str);
+
// Don't add this paragraph before the first letter header. It results in an extra blank line.
if (!firstHeader)
{
@@ -540,6 +546,27 @@ public int Length()
return str.Length;
}
+ /// <summary>
+ /// Normalizes the text of every OpenXml text node in this fragment to the given Unicode normalization form,
+ /// in place. All exported content should be NFC (LT-18177); rewriting the text nodes rather than round-tripping
+ /// the fragment through a string keeps the document structure and styles intact.
+ /// </summary>
+ /// <param name="mode">The normalization form to apply to each text node (knmNFC for export).</param>
+ public void NormalizeText(FwNormalizationMode mode)
+ {
+     if (IsNullOrEmpty())
+         return;
+
+     var normalizer = CustomIcu.GetIcuNormalizer(mode);
+     foreach (WP.Text txt in DocBody.Descendants<WP.Text>())
+     {
+         // Only the text value is replaced, so run/paragraph properties (the styles) are left untouched. Each node
+         // is normalized on its own: a base character and a combining mark split across runs are not composed.
+         if (!string.IsNullOrEmpty(txt.Text))
+             txt.Text = normalizer.Normalize(txt.Text);
+     }
+ }
+
/// <summary>
/// Appends one doc fragment to another.
/// Use this if styles have already been applied.
diff --git a/Src/xWorks/xWorksTests/LcmWordGeneratorTests.cs b/Src/xWorks/xWorksTests/LcmWordGeneratorTests.cs
index b46922f29e..2e19af328a 100644
--- a/Src/xWorks/xWorksTests/LcmWordGeneratorTests.cs
+++ b/Src/xWorks/xWorksTests/LcmWordGeneratorTests.cs
@@ -18,6 +18,7 @@
using SIL.LCModel;
using SIL.LCModel.Core.KernelInterfaces;
using SIL.LCModel.Core.Text;
+using SIL.LCModel.Core.WritingSystems;
using SIL.LCModel.DomainServices;
using SIL.WritingSystems;
using SIL.TestUtilities;
@@ -599,6 +600,65 @@ public void GenerateWordDocForEntry_LineBreaksInBeforeContentWork()
WordNamespaceManager);
}
+ [Test]
+ public void GenerateWordDocForEntry_GeneratesNFC()
+ {
+     // FieldWorks stores strings as NFD (decomposed) in memory. All exported content should be NFC
+     // (LT-18177), matching the XHTML/Webonary export. This guards the Word export against emitting NFD.
+     var headwordNode = new ConfigurableDictionaryNode
+     {
+         FieldDescription = "MLHeadWord",
+         CSSClassNameOverride = "headword",
+         DictionaryNodeOptions = ConfiguredXHTMLGeneratorTests.GetWsOptionsForLanguages(new[] { "ko" }),
+         Style = "Dictionary-Headword"
+     };
+     var mainEntryNode = new ConfigurableDictionaryNode
+     {
+         FieldDescription = "LexEntry",
+         Children = new List<ConfigurableDictionaryNode> { headwordNode },
+         Style = MainEntryParagraphStyleName
+     };
+     CssGeneratorTests.PopulateFieldsForTesting(mainEntryNode);
+
+     Cache.LangProject.AddToCurrentVernacularWritingSystems(Cache.WritingSystemFactory.get_Engine("ko") as CoreWritingSystemDefinition);
+     var wsKo = Cache.WritingSystemFactory.GetWsFromStr("ko");
+     var entry = Cache.ServiceLocator.GetInstance<ILexEntryFactory>().Create();
+     // Decompose explicitly so the input is NFD regardless of how this source file stores the literal.
+     var nfdHeadword = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD).Normalize("자ㄱㄴ시"); // Korean
+     entry.CitationForm.set_String(wsKo, TsStringUtils.MakeString(nfdHeadword, wsKo));
+     var storedHeadword = entry.CitationForm.get_String(wsKo);
+     Assert.That(storedHeadword.get_IsNormalizedForm(FwNormalizationMode.knmNFD), "Should be NFDecomposed in memory");
+     Assert.That(storedHeadword.Text.Length, Is.EqualTo(6), "NFD headword should have 6 codepoints");
+
+     //SUT
+     var result = ConfiguredLcmGenerator.GenerateContentForEntry(entry, mainEntryNode, null, DefaultSettings, 0) as DocFragment;
+     Assert.That(result, Is.Not.Null, "Results should have been generated");
+     var tsResult = TsStringUtils.MakeString(result.ToString(), wsKo);
+     Assert.That(TsStringUtils.IsNullOrEmpty(tsResult), Is.False, "Results should have been generated");
+     Assert.That(tsResult.get_IsNormalizedForm(FwNormalizationMode.knmNFC), "Resulting Word content should be NFComposed (NFC)");
+ }
+
+ [Test]
+ public void GenerateLetterHeaderDocFragment_GeneratesNFC()
+ {
+     // Letter headers are built from the NFD sort word (see GetSortWordForLetterHead), and upper-casing can
+     // leave the letter decomposed. Exported content is expected to be NFC (LT-18177).
+     var vernWs = Cache.DefaultVernWs;
+     var vernWsId = Cache.WritingSystemFactory.GetStrFromWs(vernWs);
+     // Vietnamese capital and small A with circumflex and grave, fully decomposed (base + two combining marks).
+     var decomposedHeader = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD).Normalize("Ầ ầ");
+     var decomposedTss = TsStringUtils.MakeString(decomposedHeader, vernWs);
+     Assert.That(decomposedTss.get_IsNormalizedForm(FwNormalizationMode.knmNFD), "Sanity check: the header input should be NFDecomposed");
+
+     //SUT
+     var letterHeader = DocFragment.GenerateLetterHeaderDocFragment(decomposedHeader, WordStylesGenerator.LetterHeadingDisplayName, false, vernWsId);
+
+     var headerTss = TsStringUtils.MakeString(letterHeader.ToString(), vernWs);
+     Assert.That(TsStringUtils.IsNullOrEmpty(headerTss), Is.False, "Header should have been generated");
+     var isComposed = headerTss.get_IsNormalizedForm(FwNormalizationMode.knmNFC);
+     Assert.That(isComposed, "Letter header content should be NFComposed (NFC)");
+ }
+
[Test]
public void GenerateUniqueStyleName()
{