Skip to content

fix: ensure nlp/sentencize handles punctuation in quotation marks properly #5381

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 31 additions & 6 deletions lib/node_modules/@stdlib/nlp/sentencize/lib/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ var trim = require( '@stdlib/string/base/trim' );

// VARIABLES //

// Matches a token consisting solely of one or more lowercase ASCII letters (used to detect that a sentence continues after a closing quote):
var RE_LOWERCASE = /^[a-z]+$/;

// Matches a token made of one uppercase ASCII letter followed by at most four lowercase ASCII letters (short abbreviations and bullet points):
var RE_CAPITALIZED = /^[A-Z][a-z]{0,4}$/;

// Matches zero or more "uppercase letter + period" pairs followed by a single uppercase letter (e.g., `U.S`):
var RE_CAPITALIZED_PERIOD = /^([A-Z]\.)*[A-Z]$/;

// Matches a token which is exactly one ASCII digit:
var RE_NUMBER = /^[0-9]$/;

// Matches a token which begins with an opening bracket, colon, semicolon, quote, or backtick character:
var RE_PREFIXES = /^[{[(<:;"'”`]/;

// Matches a token which ends with a closing bracket, colon, semicolon, quote, or backtick character:
var RE_SUFFIXES = /[})\]>:;"'”`]$/;

// Matches a token which is exactly one straight double quote, straight single quote, or backtick.
// NOTE(review): unlike `RE_PREFIXES` and `RE_SUFFIXES`, the curly quote `”` is not included here — confirm whether this is intentional.
var RE_QUOTES = /^["'`]$/;


// FUNCTIONS //
Expand All @@ -51,10 +53,38 @@ var RE_SUFFIXES = /[})\]>:;"'”`]$/;
* @returns {boolean} boolean indicating whether the token at a specified index is an end-of-sentence token
*/
function isEndOfSentence( tokens, i ) {
var nextToken;
var token;
var im1 = i - 1;
var ip1 = i + 1;

token = tokens[ i ];

// Handle quoted text with punctuation...
if (
RE_QUOTES.test( token ) &&
i > 0 &&
( tokens[ im1 ] === '.' || tokens[ im1 ] === '!' || tokens[ im1 ] === '?' )
) {
// Look ahead to see if sentence continues:
ip1 = i + 1;
if ( ip1 < tokens.length ) {
// Skip spaces...
while ( ip1 < tokens.length && tokens[ ip1 ] === ' ' ) {
ip1 += 1;
}
// If next non-space token is lowercase, we assume the sentence continues:
if ( ip1 < tokens.length ) {
nextToken = tokens[ ip1 ];
if ( RE_LOWERCASE.test( nextToken ) ) {
return false;
}
}
}
return true;
}

// Regular sentence ending punctuation...
if (
token === '.' &&
!RE_CAPITALIZED.test( tokens[ im1 ] ) && // for other short abbreviations and bullet points
Expand All @@ -73,12 +103,6 @@ function isEndOfSentence( tokens, i ) {
) {
return true;
}
if (
RE_SUFFIXES.test( token ) &&
( tokens[ im1 ] === '.' || tokens[ im1 ] === '!' || tokens[ im1 ] === '?' )
) {
return true;
}
return false;
}

Expand Down Expand Up @@ -112,6 +136,7 @@ function sentencize( str ) {
var tokens;
var out;
var i;

if ( !isString( str ) ) {
throw new TypeError( 'invalid argument. Must provide a string. Value: `' + str + '`.' );
}
Expand Down
38 changes: 38 additions & 0 deletions lib/node_modules/@stdlib/nlp/sentencize/test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -317,3 +317,41 @@ tape( 'the function returns an empty array if provided an empty string', functio
t.equal( out.length, 0, 'array length is zero' );
t.end();
});

tape( 'the function correctly handles punctuation within quotation marks', function test( t ) {
	var fixtures;
	var fixture;
	var idx;

	// Each fixture pairs an input string with the expected sentences and the assertion message to report:
	fixtures = [
		{
			'input': 'I said "Look out" right before he banged his head.',
			'sentences': [ 'I said "Look out" right before he banged his head.' ],
			'message': 'keeps sentence with simple quotes together'
		},
		{
			'input': 'I said "Look out!" right before he banged his head.',
			'sentences': [ 'I said "Look out!" right before he banged his head.' ],
			'message': 'keeps sentence with exclamation in quotes together'
		},
		{
			'input': 'He asked "What time is it?" before leaving.',
			'sentences': [ 'He asked "What time is it?" before leaving.' ],
			'message': 'keeps sentence with question mark in quotes together'
		},
		{
			'input': '"Stop!" he yelled. "We need to think about this."',
			'sentences': [ '"Stop!" he yelled.', '"We need to think about this."' ],
			'message': 'correctly splits multiple quoted sentences'
		},
		{
			'input': 'She said "This is great!" and smiled.',
			'sentences': [ 'She said "This is great!" and smiled.' ],
			'message': 'keeps sentence with exclamation in middle quotes together'
		},
		{
			'input': '"Is this correct?" he wondered. "I think so!" she replied.',
			'sentences': [ '"Is this correct?" he wondered.', '"I think so!" she replied.' ],
			'message': 'correctly handles multiple quoted sentences with different punctuation'
		}
	];

	// Run the fixtures in their listed order so that assertions are reported in a stable sequence:
	for ( idx = 0; idx < fixtures.length; idx++ ) {
		fixture = fixtures[ idx ];
		t.deepEqual( sentencize( fixture.input ), fixture.sentences, fixture.message );
	}
	t.end();
});