Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions Src/xWorks/ConfiguredLcmGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -429,9 +429,14 @@ internal static IFragment GenerateContentForEntry(ICmObject entry, ConfigurableD
settings.ContentGenerator.EndEntry(xw);
xw.Flush();

// Do not normalize the string if exporting to word doc--it is not needed and will cause loss of document styles
if (bldr is LcmWordGenerator.DocFragment)
return bldr;
// All content should be in NFC (LT-18177). For Word export we normalize the text inside the
// OpenXml runs in place: normalizing bldr.ToString() and rebuilding a fragment from the string
// (as done below for XHTML) would discard the document's run and paragraph styles.
if (bldr is LcmWordGenerator.DocFragment wordFragment)
{
wordFragment.NormalizeText(FwNormalizationMode.knmNFC);
return wordFragment;
}

return settings.ContentGenerator.CreateFragment(CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFC).Normalize(bldr.ToString())); // All content should be in NFC (LT-18177)
}
Expand Down
27 changes: 27 additions & 0 deletions Src/xWorks/LcmWordGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
using SIL.FieldWorks.Common.FwUtils;
using SIL.FieldWorks.Common.Widgets;
using SIL.LCModel;
using SIL.LCModel.Core.Text;
using SIL.LCModel.Core.WritingSystems;
using SIL.LCModel.DomainServices;
using SIL.LCModel.Utils;
Expand Down Expand Up @@ -424,6 +425,11 @@ internal static DocFragment GenerateLetterHeaderDocFragment(string str, string s
// Only create paragraph, run, and text objects if string is nonempty
if (!string.IsNullOrEmpty(str))
{
// All exported content should be NFC (LT-18177). The header letter derives from the NFD sort
// word (see ConfiguredLcmGenerator.GetSortWordForLetterHead) and its upper-cased form can be
// decomposed, so normalize before writing it into the run.
str = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFC).Normalize(str);

// Don't add this paragraph before the first letter header. It results in an extra blank line.
if (!firstHeader)
{
Expand Down Expand Up @@ -540,6 +546,27 @@ public int Length()
return str.Length;
}

/// <summary>
/// Rewrites, in place, the text of every <c>WP.Text</c> node found under <c>DocBody</c> so that it is
/// in the requested Unicode normalization form.
/// FieldWorks holds strings as NFD in memory, whereas exported content is expected to be NFC (LT-18177),
/// as in the XHTML/Webonary export. Working on the text nodes directly, instead of converting the whole
/// fragment to a string and building a new fragment from it, keeps the document structure and styles.
/// </summary>
/// <param name="mode">The normalization form to apply to each text node.</param>
public void NormalizeText(FwNormalizationMode mode)
{
    // Nothing to normalize in an empty fragment.
    if (!IsNullOrEmpty())
    {
        var icuNormalizer = CustomIcu.GetIcuNormalizer(mode);
        // NOTE(review): each text node is normalized on its own, so a combining sequence that is
        // split across two nodes would not be composed together -- confirm whether that can occur.
        foreach (var textNode in DocBody.Descendants<WP.Text>())
        {
            var current = textNode.Text;
            if (string.IsNullOrEmpty(current))
                continue;

            // Only the node's text value is assigned; no attributes or run/paragraph
            // properties (i.e. the styles) are written here.
            textNode.Text = icuNormalizer.Normalize(current);
        }
    }
}

/// <summary>
/// Appends one doc fragment to another.
/// Use this if styles have already been applied.
Expand Down
60 changes: 60 additions & 0 deletions Src/xWorks/xWorksTests/LcmWordGeneratorTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
using SIL.LCModel;
using SIL.LCModel.Core.KernelInterfaces;
using SIL.LCModel.Core.Text;
using SIL.LCModel.Core.WritingSystems;
using SIL.LCModel.DomainServices;
using SIL.WritingSystems;
using SIL.TestUtilities;
Expand Down Expand Up @@ -599,6 +600,65 @@ public void GenerateWordDocForEntry_LineBreaksInBeforeContentWork()
WordNamespaceManager);
}

/// <summary>
/// Verifies that the Word content generated for an entry whose headword is held as NFD
/// comes out NFC-normalized.
/// </summary>
[Test]
public void GenerateWordDocForEntry_GeneratesNFC()
{
// FieldWorks stores strings as NFD (decomposed) in memory. All exported content should be NFC
// (LT-18177), matching the XHTML/Webonary export. This guards the Word export against emitting NFD.
var headwordNode = new ConfigurableDictionaryNode
{
FieldDescription = "MLHeadWord",
CSSClassNameOverride = "headword",
DictionaryNodeOptions = ConfiguredXHTMLGeneratorTests.GetWsOptionsForLanguages(new[] { "ko" }),
Style = "Dictionary-Headword"
};
var mainEntryNode = new ConfigurableDictionaryNode
{
FieldDescription = "LexEntry",
Children = new List<ConfigurableDictionaryNode> { headwordNode },
Style = MainEntryParagraphStyleName
};
CssGeneratorTests.PopulateFieldsForTesting(mainEntryNode);

// Make Korean a current vernacular writing system and look up its handle for the headword string.
Cache.LangProject.AddToCurrentVernacularWritingSystems(Cache.WritingSystemFactory.get_Engine("ko") as CoreWritingSystemDefinition);
var wsKo = Cache.WritingSystemFactory.GetWsFromStr("ko");
var entry = Cache.ServiceLocator.GetInstance<ILexEntryFactory>().Create();
// Decompose explicitly so the input is NFD regardless of how this source file stores the literal.
var nfdHeadword = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD).Normalize("자ㄱㄴ시"); // Korean
entry.CitationForm.set_String(wsKo, TsStringUtils.MakeString(nfdHeadword, wsKo));
// Read the headword back and confirm the precondition: what is stored is in NFD form.
var storedHeadword = entry.CitationForm.get_String(wsKo);
Assert.That(storedHeadword.get_IsNormalizedForm(FwNormalizationMode.knmNFD), "Should be NFDecomposed in memory");
Assert.That(storedHeadword.Text.Length, Is.EqualTo(6), "NFD headword should have 6 codepoints");

//SUT
var result = ConfiguredLcmGenerator.GenerateContentForEntry(entry, mainEntryNode, null, DefaultSettings, 0) as DocFragment;
Assert.That(result, Is.Not.Null, "Results should have been generated");
// Wrap the fragment's string form in a TsString so the get_IsNormalizedForm check below can be applied to it.
var tsResult = TsStringUtils.MakeString(result.ToString(), wsKo);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a reason we convert it into a ts string before testing?

Assert.That(TsStringUtils.IsNullOrEmpty(tsResult), Is.False, "Results should have been generated");
Assert.That(tsResult.get_IsNormalizedForm(FwNormalizationMode.knmNFC), "Resulting Word content should be NFComposed (NFC)");
}

/// <summary>
/// Verifies that a letter header fragment built from a decomposed (NFD) header string
/// contains NFC text.
/// </summary>
[Test]
public void GenerateLetterHeaderDocFragment_GeneratesNFC()
{
    // The header letter comes from the NFD sort word (see GetSortWordForLetterHead), and upper-casing
    // it can leave it decomposed. Exported content is expected to be NFC (LT-18177).
    var vernWs = Cache.DefaultVernWs;
    var vernWsId = Cache.WritingSystemFactory.GetStrFromWs(vernWs);

    // Vietnamese "Ầ ầ", fully decomposed: A/a + combining circumflex + combining grave.
    var decomposer = CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD);
    var decomposedHeader = decomposer.Normalize("Ầ ầ");
    var tsInput = TsStringUtils.MakeString(decomposedHeader, vernWs);
    Assert.That(tsInput.get_IsNormalizedForm(FwNormalizationMode.knmNFD),
        "Sanity check: the header input should be NFDecomposed");

    //SUT
    var fragment = DocFragment.GenerateLetterHeaderDocFragment(
        decomposedHeader,
        WordStylesGenerator.LetterHeadingDisplayName,
        false,
        vernWsId);

    // Check the fragment's string form through a TsString so get_IsNormalizedForm can be used.
    var fragmentText = fragment.ToString();
    var tsOutput = TsStringUtils.MakeString(fragmentText, vernWs);
    Assert.That(TsStringUtils.IsNullOrEmpty(tsOutput), Is.False, "Header should have been generated");
    Assert.That(tsOutput.get_IsNormalizedForm(FwNormalizationMode.knmNFC),
        "Letter header content should be NFComposed (NFC)");
}

[Test]
public void GenerateUniqueStyleName()
{
Expand Down
Loading