Skip to content

Commit

Permalink
implementing solution for text consistency checks (proycon/folia#92) …
Browse files Browse the repository at this point in the history
…(not done yet, have to address the logic in parseXml still)
  • Loading branch information
proycon committed Feb 3, 2021
1 parent 654f441 commit d951889
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 45 deletions.
42 changes: 28 additions & 14 deletions include/libfolia/folia_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,12 @@ namespace folia {
The default is NOT STRICT, meaning to get text from deeper
textcontent nodes too. (stopping at the first that HAS text)
*/
HIDDEN=4 //!< Include text from 'hidden' nodes.
HIDDEN=4, //!< Include text from 'hidden' nodes.
NO_TRIM_SPACES=8 /*!< Do not trim leading and trailing spaces (was the default
prior to FoLiA v2.4.1, see
https://github.com/proycon/folia/issues/92)
*/

};

inline TEXT_FLAGS operator&( TEXT_FLAGS f1, TEXT_FLAGS f2 ){
Expand Down Expand Up @@ -319,7 +324,7 @@ namespace folia {
// text/string content
bool hastext( const std::string& = "current" ) const;
bool hasphon( const std::string& = "current" ) const;
virtual void check_text_consistency() const = 0;
virtual void check_text_consistency(bool = true) const = 0;
virtual void check_append_text_consistency( const FoliaElement * ) const = 0;

virtual const std::string str( const std::string& = "current" ) const = 0;
Expand All @@ -328,12 +333,13 @@ namespace folia {
virtual const UnicodeString private_text( const std::string& = "current",
bool = false,
bool = false,
bool = false ) const = 0;
bool = false,
bool = true) const = 0;
virtual const UnicodeString text( const std::string&,
TEXT_FLAGS = TEXT_FLAGS::NONE ) const = 0;
virtual const UnicodeString text( TEXT_FLAGS = TEXT_FLAGS::NONE ) const = 0;
const UnicodeString stricttext( const std::string& = "current" ) const;
const UnicodeString toktext( const std::string& = "current" ) const;
const UnicodeString stricttext( const std::string& = "current", bool = true ) const;
const UnicodeString toktext( const std::string& = "current", bool = true ) const;
virtual const UnicodeString phon( const std::string&,
TEXT_FLAGS = TEXT_FLAGS::NONE ) const = 0;
virtual const UnicodeString phon( TEXT_FLAGS = TEXT_FLAGS::NONE ) const = 0;
Expand Down Expand Up @@ -662,7 +668,8 @@ namespace folia {
const UnicodeString private_text( const std::string& = "current",
bool = false,
bool = false,
bool = false ) const;
bool = false,
bool = true) const;
const UnicodeString text( const std::string&,
TEXT_FLAGS = TEXT_FLAGS::NONE ) const;
const UnicodeString text( TEXT_FLAGS flags = TEXT_FLAGS::NONE ) const {
Expand Down Expand Up @@ -768,7 +775,7 @@ namespace folia {
bool addable( const FoliaElement * ) const;
void setDateTime( const std::string& );
const std::string getDateTime() const;
void check_text_consistency() const;
void check_text_consistency(bool = true) const;
void check_append_text_consistency( const FoliaElement * ) const;
void check_declaration();
private:
Expand Down Expand Up @@ -1238,7 +1245,8 @@ namespace folia {
const UnicodeString private_text( const std::string& = "current",
bool = false,
bool = false,
bool = false ) const;
bool = false,
bool = true) const;
static properties PROPS;
std::string _original;
};
Expand Down Expand Up @@ -1386,7 +1394,8 @@ namespace folia {
const UnicodeString private_text( const std::string& = "current",
bool = false,
bool = false,
bool = false ) const;
bool = false,
bool = true) const;
static properties PROPS;

};
Expand Down Expand Up @@ -1531,7 +1540,8 @@ namespace folia {
const UnicodeString private_text( const std::string& = "current",
bool = false,
bool = false,
bool = false ) const {
bool = false,
bool = true) const {
return "\n";
}
static properties PROPS;
Expand All @@ -1553,7 +1563,8 @@ namespace folia {
const UnicodeString private_text( const std::string& = "current",
bool = false,
bool = false,
bool = false ) const {
bool = false,
bool = true) const {
return "\n\n";
}
static properties PROPS;
Expand Down Expand Up @@ -2585,7 +2596,8 @@ namespace folia {
const UnicodeString private_text( const std::string& = "current",
bool = false,
bool = false,
bool = false ) const { return ""; };
bool = false,
bool = true) const { return ""; };
static properties PROPS;
std::string _value;
};
Expand All @@ -2606,7 +2618,8 @@ namespace folia {
const UnicodeString private_text( const std::string& = "current",
bool = false,
bool = false,
bool = false ) const;
bool = false,
bool = true) const;
static properties PROPS;
std::string _value; //UTF8 value
};
Expand Down Expand Up @@ -2745,7 +2758,8 @@ namespace folia {
const UnicodeString private_text( const std::string& = "current",
bool = false,
bool = false,
bool = false ) const;
bool = false,
bool = true) const;
static properties PROPS;
};

Expand Down
99 changes: 68 additions & 31 deletions src/folia_impl.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -1233,7 +1233,7 @@ namespace folia {
}
}

void AbstractElement::check_text_consistency( ) const {
void AbstractElement::check_text_consistency( bool trim_spaces ) const {
/// check the text consistency of the combined text of the children
/// against the text of the Element.
/*!
Expand All @@ -1260,8 +1260,12 @@ namespace folia {
&& parent->hastext( cls ) ){
// check text consistency for parents with text
// but SKIP Corrections
UnicodeString s1 = parent->text( cls, TEXT_FLAGS::STRICT );
UnicodeString s2 = this->text( cls, TEXT_FLAGS::NONE );
TEXT_FLAGS flags = TEXT_FLAGS::STRICT;
if (!trim_spaces) flags |= TEXT_FLAGS::NO_TRIM_SPACES;
UnicodeString s1 = parent->text( cls, flags);
flags = TEXT_FLAGS::NONE;
if (!trim_spaces) flags |= TEXT_FLAGS::NO_TRIM_SPACES;
UnicodeString s2 = this->text( cls, flags );
// no retain tokenization, strict for parent, deeper for child
s1 = normalize_spaces( s1 );
s2 = normalize_spaces( s2 );
Expand All @@ -1278,14 +1282,31 @@ namespace folia {
test_fail = ( s1 != s2 );
}
if ( test_fail ){
throw InconsistentText( "text (class="
+ cls + ") from node: " + xmltag()
+ "(" + id() + ")"
+ " with value\n'" + TiCC::UnicodeToUTF8(s2)
+ "'\n to element: " + parent->xmltag() +
+ "(" + parent->id() + ") which already has "
+ "text in that class and value: \n'"
+ TiCC::UnicodeToUTF8(s1) + "'\n" );
bool warn_only = false;
if (trim_spaces) {
//ok, we failed according to the >=v2.4.1 rules
//but do we also fail under the old rules?
try {
this->check_text_consistency(false);
warn_only = true;
} catch ( const InconsistentText& ) {
//ignore, we raise the newer error
}
}
string msg = "text (class="
+ cls + ") from node: " + xmltag()
+ "(" + id() + ")"
+ " with value\n'" + TiCC::UnicodeToUTF8(s2)
+ "'\n to element: " + parent->xmltag() +
+ "(" + parent->id() + ") which already has "
+ "text in that class and value: \n'"
+ TiCC::UnicodeToUTF8(s1) + "'\n";
if (warn_only) {
msg += "However, according to the older rules (<v2.4.1) the text is consistent. So we are treating this as a warning rather than an error. We do recommend fixing this if this is a document you intend to publish.\n";
cerr << "WARNING: inconsistent text: " << msg << endl;
} else {
throw InconsistentText(msg);
}
}
}
}
Expand Down Expand Up @@ -1555,14 +1576,17 @@ namespace folia {
const UnicodeString AbstractElement::private_text( const string& cls,
bool retaintok,
bool strict,
bool show_hidden ) const {
bool show_hidden,
bool trim_spaces ) const {

/// get the UnicodeString value of an element
/*!
* \param cls The textclass we are looking for
* \param retaintok retain the tokenisation information
* \param strict If true, return the text of this level only
* when false, allow recursing into children
* \param show_hidden include text from 'hidden' nodes too.
* \param trim_spaces Trim leading and trailing spaces (defaults to True since FoLiA v2.4.1)
* \return the Unicode String representation found. Throws when
* no text can be found
*/
Expand Down Expand Up @@ -1593,11 +1617,11 @@ namespace folia {
int i = 0;
for ( const auto& d : _data ){
if (d->isinstance( XmlText_t)) {
if ((i == 0) && (i == (int) _data.size() -1)) {
if ((trim_spaces) && (i == 0) && (i == (int) _data.size() -1)) {
result += rtrim(ltrim(d->text( cls )));
} else if (i == 0) {
} else if ((trim_spaces) && (i == 0)) {
result += ltrim(d->text( cls ));
} else if (i == (int) _data.size() - 1) {
} else if ((trim_spaces) && (i == (int) _data.size() - 1)) {
result += rtrim(d->text( cls ));
} else {
result += d->text( cls );
Expand All @@ -1607,7 +1631,9 @@ namespace folia {
const string& delim = d->get_delimiter( retaintok );
result += TiCC::UnicodeFromUTF8(delim);
}
result += d->text( cls );
TEXT_FLAGS flags = TEXT_FLAGS::NONE;
if (!trim_spaces) flags |= TEXT_FLAGS::NO_TRIM_SPACES;
result += d->text( cls, flags );
}
i++;
}
Expand All @@ -1628,9 +1654,12 @@ namespace folia {
if ( show_hidden ){
flags |= TEXT_FLAGS::HIDDEN;
}
if (!trim_spaces) flags |= TEXT_FLAGS::NO_TRIM_SPACES;
UnicodeString result = deeptext( cls, flags );
if ( result.isEmpty() ) {
result = stricttext( cls );
TEXT_FLAGS flags = TEXT_FLAGS::NONE;
if (!trim_spaces) flags |= TEXT_FLAGS::NO_TRIM_SPACES;
result = stricttext( cls, trim_spaces );
}
if ( result.isEmpty() ) {
throw NoSuchText( "on tag " + xmltag() + " nor it's children" );
Expand All @@ -1649,7 +1678,8 @@ namespace folia {
bool retain = ( TEXT_FLAGS::RETAIN & flags ) == TEXT_FLAGS::RETAIN;
bool strict = ( TEXT_FLAGS::STRICT & flags ) == TEXT_FLAGS::STRICT;
bool hidden = ( TEXT_FLAGS::HIDDEN & flags ) == TEXT_FLAGS::HIDDEN;
return private_text( cls, retain, strict, hidden );
bool trim_spaces = !( ( TEXT_FLAGS::NO_TRIM_SPACES & flags ) == TEXT_FLAGS::NO_TRIM_SPACES);
return private_text( cls, retain, strict, hidden, trim_spaces );
}

void FoLiA::setAttributes( KWargs& kwargs ){
Expand Down Expand Up @@ -1737,13 +1767,15 @@ namespace folia {
const UnicodeString FoLiA::private_text( const string& cls,
bool retaintok,
bool strict,
bool ) const {
bool,
bool trim_spaces) const {
/// get the UnicodeString value of a FoLiA topnode
/*!
* \param cls The textclass we are looking for
* \param retaintok retain the tokenisation information
* \param strict If true, return the text of the direct children only
* when false, allow recursing into children
* \param trim_spaces Trim leading and trailing spaces (defaults to True since FoLiA v2.4.1)
* \return the Unicode String representation found. Throws when
* no text can be found
*/
Expand All @@ -1757,7 +1789,7 @@ namespace folia {
const string& delim = d->get_delimiter( retaintok );
result += TiCC::UnicodeFromUTF8(delim);
}
result += d->private_text( cls, retaintok, strict, false );
result += d->private_text( cls, retaintok, strict, false, trim_spaces );
}
#ifdef DEBUG_TEXT
cerr << "FoLiA::TEXT returns '" << result << "'" << endl;
Expand Down Expand Up @@ -2000,25 +2032,29 @@ namespace folia {
return result;
}

const UnicodeString FoliaElement::stricttext( const string& cls ) const {
const UnicodeString FoliaElement::stricttext( const string& cls, bool trim_spaces ) const {
/// get the UnicodeString value of TextContent children only
/*!
* \param cls the textclass
* \return The Unicode Text found.
* Will throw on error.
*/
return this->text(cls, TEXT_FLAGS::STRICT );
TEXT_FLAGS flags = TEXT_FLAGS::STRICT;
if (!trim_spaces) flags |= TEXT_FLAGS::NO_TRIM_SPACES;
return this->text(cls, flags );
}

const UnicodeString FoliaElement::toktext( const string& cls ) const {
const UnicodeString FoliaElement::toktext( const string& cls, bool trim_spaces ) const {
/// get the UnicodeString value of TextContent children only, retaining
/// tokenization
/*!
* \param cls the textclass
* \return The Unicode Text found.
* Will throw on error.
*/
return this->text(cls, TEXT_FLAGS::RETAIN );
TEXT_FLAGS flags = TEXT_FLAGS::RETAIN;
if (!trim_spaces) flags |= TEXT_FLAGS::NO_TRIM_SPACES;
return this->text(cls, flags );
}

const TextContent *AbstractElement::text_content( const string& cls,
Expand Down Expand Up @@ -5521,7 +5557,7 @@ namespace folia {
//#define DEBUG_TEXT
const UnicodeString Correction::private_text( const string& cls,
bool retaintok,
bool, bool ) const {
bool, bool, bool trim_spaces ) const {
/// get the UnicodeString value of an Correction
/*!
* \param cls The textclass we are looking for
Expand All @@ -5548,7 +5584,7 @@ namespace folia {
}
else {
try {
new_result = el->private_text( cls, retaintok, false, false );
new_result = el->private_text( cls, retaintok, false, false, trim_spaces );
}
catch ( ... ){
// try other nodes
Expand All @@ -5557,15 +5593,15 @@ namespace folia {
}
else if ( el->isinstance( Original_t ) ){
try {
org_result = el->private_text( cls, retaintok, false, false );
org_result = el->private_text( cls, retaintok, false, false, trim_spaces );
}
catch ( ... ){
// try other nodes
}
}
else if ( el->isinstance( Current_t ) ){
try {
cur_result = el->private_text( cls, retaintok, false, false );
cur_result = el->private_text( cls, retaintok, false, false, trim_spaces );
}
catch ( ... ){
// try other nodes
Expand Down Expand Up @@ -6003,7 +6039,7 @@ namespace folia {
return true;
}

const UnicodeString XmlText::private_text( const string&, bool, bool, bool ) const {
const UnicodeString XmlText::private_text( const string&, bool, bool, bool, bool ) const {
/// get the UnicodeString value of an XmlText element
/*!
*/
Expand Down Expand Up @@ -6506,7 +6542,8 @@ namespace folia {
const UnicodeString TextMarkupCorrection::private_text( const string& cls,
bool retaintok,
bool strict,
bool show_hidden ) const{
bool show_hidden,
bool trim_spaces ) const{
/// get the UnicodeString value of a TextMarkupCorrection element
/*!
* \param cls The textclass we are looking for
Expand All @@ -6522,7 +6559,7 @@ namespace folia {
if ( cls == "original" ) {
return TiCC::UnicodeFromUTF8(_original);
}
return AbstractElement::private_text( cls, retaintok, strict, show_hidden );
return AbstractElement::private_text( cls, retaintok, strict, show_hidden, trim_spaces );
}

const FoliaElement* AbstractTextMarkup::resolveid() const {
Expand Down

0 comments on commit d951889

Please sign in to comment.