Skip to content

Disallow unbalanced bidirectional unicode #288

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into the base branch on
May 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/kinds.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const _kind_names =
"ErrorInvalidUTF8"
"ErrorInvisibleChar"
"ErrorUnknownCharacter"
"ErrorBidiFormatting"
# Generic error
"error"
"END_ERRORS"
Expand Down Expand Up @@ -1049,6 +1050,7 @@ const _nonunique_kind_names = Set([
K"ErrorInvalidUTF8"
K"ErrorInvisibleChar"
K"ErrorUnknownCharacter"
K"ErrorBidiFormatting"
K"ErrorInvalidOperator"

K"Integer"
Expand Down Expand Up @@ -1098,6 +1100,7 @@ const _token_error_descriptions = Dict{Kind, String}(
K"ErrorInvalidUTF8"=>"invalid UTF-8 character",
K"ErrorInvisibleChar"=>"invisible character",
K"ErrorUnknownCharacter"=>"unknown unicode character",
K"ErrorBidiFormatting"=>"unbalanced bidirectional unicode formatting",
K"ErrorInvalidOperator" => "invalid operator",
K"Error**" => "use `x^y` instead of `x**y` for exponentiation, and `x...` instead of `**x` for splatting",
K"error" => "unknown error token",
Expand Down
2 changes: 2 additions & 0 deletions src/parse_stream.jl
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,8 @@ function validate_tokens(stream::ParseStream)
# Emit messages for non-generic token errors
msg = if k in KSet"ErrorInvalidUTF8 ErrorInvisibleChar ErrorUnknownCharacter"
"$(_token_error_descriptions[k]) $(repr(text[fbyte]))"
elseif k == K"ErrorBidiFormatting"
"$(_token_error_descriptions[k]) $(repr(text[fbyte:prevind(text, nbyte)]))"
else
_token_error_descriptions[k]
end
Expand Down
5 changes: 5 additions & 0 deletions src/parser.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3282,6 +3282,9 @@ function parse_string(ps::ParseState, raw::Bool)
first_chunk = false
n_valid_chunks += 1
end
elseif k == K"ErrorInvalidInterpolationTerminator" || k == K"ErrorBidiFormatting"
# Treat these errors as string chunks
bump(ps)
else
break
end
Expand Down Expand Up @@ -3381,6 +3384,8 @@ function parse_atom(ps::ParseState, check_identifiers=true)
else
if k == K"Char"
bump(ps)
elseif is_error(k)
bump(ps)
else
# FIXME: This case is actually a tokenization error.
# Make a best-effort attempt to workaround this for now by
Expand Down
114 changes: 72 additions & 42 deletions src/tokenize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module Tokenize

export tokenize, untokenize, Tokens

using ..JuliaSyntax: JuliaSyntax, Kind, @K_str
using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str

import ..JuliaSyntax: kind,
is_literal, is_error, is_contextual_keyword, is_word_operator
Expand Down Expand Up @@ -382,9 +382,6 @@ end

Returns the next character and increments the current position.
"""
function readchar end


function readchar(l::Lexer)
c = readchar(l.io)
l.chars = (l.chars[2], l.chars[3], l.chars[4], c)
Expand Down Expand Up @@ -446,17 +443,6 @@ function emit(l::Lexer, kind::Kind, maybe_op=true)
return tok
end

"""
emit_error(l::Lexer, err::Kind)

Returns an `K"error"` token with error `err` and starts a new `RawToken`.
"""
function emit_error(l::Lexer, err::Kind)
@assert is_error(err)
return emit(l, err)
end


"""
next_token(l::Lexer)

Expand Down Expand Up @@ -551,20 +537,43 @@ function _next_token(l::Lexer, c)
elseif (k = get(_unicode_ops, c, K"error")) != K"error"
return emit(l, k)
else
emit_error(l,
emit(l,
!isvalid(c) ? K"ErrorInvalidUTF8" :
is_invisible_char(c) ? K"ErrorInvisibleChar" :
K"ErrorUnknownCharacter")
end
end

# UAX #9: Unicode Bidirectional Algorithm
# https://unicode.org/reports/tr9/
# Very partial implementation - just enough to check correct nesting in strings
# and multiline comments.
"""
    update_bidi_state((embedding_nesting, isolate_nesting), c)

Fold the character `c` into the bidirectional-formatting nesting counters,
returning the updated `(embedding_nesting, isolate_nesting)` tuple.
A newline resets both counters; explicit embedding/override characters
adjust the first counter and isolate characters adjust the second.
"""
function update_bidi_state((embedding_nesting, isolate_nesting), c)
    if c == '\n'
        # Bidi formatting state does not carry across lines: reset both counts.
        return (0, 0)
    end
    if c in ('\U202A', '\U202B', '\U202D', '\U202E')
        # LRE, RLE, LRO, RLO open an embedding/override.
        embedding_nesting += 1
    elseif c == '\U202C'
        # PDF closes one embedding/override.
        embedding_nesting -= 1
    elseif c in ('\U2066', '\U2067', '\U2068')
        # LRI, RLI, FSI open an isolate.
        isolate_nesting += 1
    elseif c == '\U2069'
        # PDI closes one isolate.
        isolate_nesting -= 1
    end
    return (embedding_nesting, isolate_nesting)
end

# We're inside a string; possibly reading the string characters, or maybe in
# Julia code within an interpolation.
function lex_string_chunk(l)
state = last(l.string_states)
if state.paren_depth > 0
# Read normal Julia code inside an interpolation but track nesting of
# parentheses.
# TODO: This stateful tracking should probably, somehow, be done by the
# parser instead? Especially for recovery of unbalanced parens inside
# interpolations?
c = readchar(l)
if c == '('
l.string_states[end] = StringState(state.triplestr, state.raw, state.delim,
Expand Down Expand Up @@ -598,7 +607,7 @@ function lex_string_chunk(l)
# Only allow certain characters after interpolated vars
# https://github.com/JuliaLang/julia/pull/25234
readchar(l)
return emit_error(l, K"ErrorInvalidInterpolationTerminator")
return emit(l, K"ErrorInvalidInterpolationTerminator")
end
if pc == EOF_CHAR
return emit(l, K"EndMarker")
Expand Down Expand Up @@ -637,6 +646,8 @@ function lex_string_chunk(l)
end
end
# Read a chunk of string characters
init_bidi_state = (0,0)
bidi_state = init_bidi_state
if state.raw
# Raw strings treat all characters as literals with the exception that
# the closing quotes can be escaped with an odd number of \ characters.
Expand All @@ -647,7 +658,10 @@ function lex_string_chunk(l)
elseif state.triplestr && (pc == '\n' || pc == '\r')
# triple quoted newline splitting
readchar(l)
if pc == '\r' && peekchar(l) == '\n'
if pc == '\n'
bidi_state = init_bidi_state
elseif pc == '\r' && peekchar(l) == '\n'
bidi_state = init_bidi_state
readchar(l)
end
break
Expand All @@ -663,6 +677,7 @@ function lex_string_chunk(l)
readchar(l)
end
end
bidi_state = update_bidi_state(bidi_state, c)
end
else
while true
Expand All @@ -672,29 +687,39 @@ function lex_string_chunk(l)
elseif state.triplestr && (pc == '\n' || pc == '\r')
# triple quoted newline splitting
readchar(l)
if pc == '\r' && peekchar(l) == '\n'
if pc == '\n'
bidi_state = init_bidi_state
elseif pc == '\r' && peekchar(l) == '\n'
readchar(l)
bidi_state = init_bidi_state
end
break
elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
break
elseif pc == '\\'
# Escaped newline
pc2 = dpeekchar(l)[2]
_, pc2, pc3 = peekchar3(l)
if pc2 == '\r' || pc2 == '\n'
if pc2 == '\n' || pc3 == '\n'
bidi_state = init_bidi_state
end
break
end
end
c = readchar(l)
if c == '\\'
c = readchar(l)
c == EOF_CHAR && break
continue
end
bidi_state = update_bidi_state(bidi_state, c)
end
end
return emit(l, state.delim == '"' ? K"String" :
state.delim == '`' ? K"CmdString" : K"Char")
outk = state.delim == '\'' ? K"Char" :
bidi_state != init_bidi_state ? K"ErrorBidiFormatting" :
state.delim == '"' ? K"String" :
state.delim == '`' ? K"CmdString" :
(@assert(state.delim in KSet"' \" `"); K"error")
return emit(l, outk)
end

# Lex whitespace, a whitespace char `c` has been consumed
Expand Down Expand Up @@ -725,13 +750,16 @@ function lex_comment(l::Lexer)
end
else
c = readchar(l) # consume the '='
init_bidi_state = (0,0)
bidi_state = init_bidi_state
skip = true # true => c was part of the prev comment marker pair
nesting = 1
while true
if c == EOF_CHAR
return emit_error(l, K"ErrorEofMultiComment")
return emit(l, K"ErrorEofMultiComment")
end
nc = readchar(l)
bidi_state = update_bidi_state(bidi_state, nc)
if skip
skip = false
else
Expand All @@ -742,7 +770,9 @@ function lex_comment(l::Lexer)
nesting -= 1
skip = true
if nesting == 0
return emit(l, K"Comment")
outk = bidi_state == init_bidi_state ?
K"Comment" : K"ErrorBidiFormatting"
return emit(l, outk)
end
end
end
Expand Down Expand Up @@ -791,12 +821,12 @@ function lex_less(l::Lexer)
elseif dpeekchar(l) == ('-', '-')
readchar(l); readchar(l)
if accept(l, '-')
return emit_error(l, K"ErrorInvalidOperator")
return emit(l, K"ErrorInvalidOperator")
else
if accept(l, '>')
return emit(l, K"<-->")
elseif accept(l, '-')
return emit_error(l, K"ErrorInvalidOperator")
return emit(l, K"ErrorInvalidOperator")
else
return emit(l, K"<--")
end
Expand Down Expand Up @@ -879,7 +909,7 @@ function lex_minus(l::Lexer)
if accept(l, '>')
return emit(l, K"-->")
else
return emit_error(l, K"ErrorInvalidOperator") # "--" is an invalid operator
return emit(l, K"ErrorInvalidOperator") # "--" is an invalid operator
end
elseif !l.dotop && accept(l, '>')
return emit(l, K"->")
Expand All @@ -891,7 +921,7 @@ end

function lex_star(l::Lexer)
if accept(l, '*')
return emit_error(l, K"Error**") # "**" is an invalid operator use ^
return emit(l, K"Error**") # "**" is an invalid operator use ^
elseif accept(l, '=')
return emit(l, K"*=")
end
Expand Down Expand Up @@ -952,15 +982,15 @@ function lex_digit(l::Lexer, kind)
elseif kind === K"Float"
# If we enter the function with kind == K"Float" then a '.' has been parsed.
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant")
return emit(l, K"ErrorInvalidNumericConstant")
elseif is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorAmbiguousNumericConstant") # `1.+`
return emit(l, K"ErrorAmbiguousNumericConstant") # `1.+`
end
readchar(l)

kind = K"Float"
accept(l, '_') && return emit_error(l, K"ErrorInvalidNumericConstant") # `1._`
accept(l, '_') && return emit(l, K"ErrorInvalidNumericConstant") # `1._`
had_fraction_digs = accept_number(l, isdigit)
pc, ppc = dpeekchar(l)
if (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
Expand All @@ -971,18 +1001,18 @@ function lex_digit(l::Lexer, kind)
pc,ppc = dpeekchar(l)
if pc === '.' && !is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e1.`
return emit(l, K"ErrorInvalidNumericConstant") # `1.e1.`
end
else
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.e`
return emit(l, K"ErrorInvalidNumericConstant") # `1.e`
end
elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char(ppc)
readchar(l)
return emit_error(l, K"ErrorInvalidNumericConstant") # `1.1.`
return emit(l, K"ErrorInvalidNumericConstant") # `1.1.`
elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
pc == '(' || pc == '[' || pc == '{' ||
pc == '@' || pc == '`' || pc == '"')
return emit_error(l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
return emit(l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
end
elseif (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
kind = pc == 'f' ? K"Float32" : K"Float"
Expand All @@ -992,10 +1022,10 @@ function lex_digit(l::Lexer, kind)
pc,ppc = dpeekchar(l)
if pc === '.' && !is_dottable_operator_start_char(ppc)
accept(l, '.')
return emit_error(l, K"ErrorInvalidNumericConstant") # `1e1.`
return emit(l, K"ErrorInvalidNumericConstant") # `1e1.`
end
else
return emit_error(l, K"ErrorInvalidNumericConstant") # `1e+`
return emit(l, K"ErrorInvalidNumericConstant") # `1e+`
end
elseif position(l) - startpos(l) == 1 && l.chars[1] == '0'
kind == K"Integer"
Expand All @@ -1015,10 +1045,10 @@ function lex_digit(l::Lexer, kind)
kind = K"Float"
accept(l, "+-−")
if !accept_number(l, isdigit) || !had_digits
return emit_error(l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
return emit(l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
end
elseif isfloat
return emit_error(l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
return emit(l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
end
is_bin_oct_hex_int = !isfloat
elseif pc == 'b'
Expand All @@ -1038,7 +1068,7 @@ function lex_digit(l::Lexer, kind)
accept_batch(l, c->isdigit(c) || is_identifier_start_char(c))
# `0x` `0xg` `0x_` `0x-`
# `0b123` `0o78p` `0xenomorph` `0xaα`
return emit_error(l, K"ErrorInvalidNumericConstant")
return emit(l, K"ErrorInvalidNumericConstant")
end
end
end
Expand Down Expand Up @@ -1132,7 +1162,7 @@ function lex_dot(l::Lexer)
else
if is_dottable_operator_start_char(peekchar(l))
readchar(l)
return emit_error(l, K"ErrorInvalidOperator")
return emit(l, K"ErrorInvalidOperator")
else
return emit(l, K"..")
end
Expand Down
7 changes: 7 additions & 0 deletions test/diagnostics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ end
Diagnostic(2, 1+sizeof(string(c)), :error, "invisible character $(repr(c))")
end
@test diagnostic(":⥻") == Diagnostic(2, 4, :error, "unknown unicode character '⥻'")

@test diagnostic("\"X \u202a X\"") == Diagnostic(2, 8, :error, "unbalanced bidirectional unicode formatting \"X \\u202a X\"")
@test diagnostic("#= \u202a =#") == Diagnostic(1, 9, :error, "unbalanced bidirectional unicode formatting \"#= \\u202a =#\"")
@test diagnostic("\"X \u202a \$xx\u202c\"", allow_multiple=true) == [
Diagnostic(2, 7, :error, "unbalanced bidirectional unicode formatting \"X \\u202a \"")
Diagnostic(11, 13, :error, "unbalanced bidirectional unicode formatting \"\\u202c\"")
]
end

@testset "parser errors" begin
Expand Down
Loading