Skip to content

Commit c2a95ad

Browse files
authored
[flang][runtime] Handle multi-byte characters while tabbing (#101388)
When repositioning within the current record with control edit descriptors (Xn, Tn, TLn, TRn), deal with multiple-byte character encodings. This affects only external I/O to units with UTF-8 encoding.
1 parent b1a1d4e commit c2a95ad

File tree

9 files changed

+165
-22
lines changed

9 files changed

+165
-22
lines changed

flang/runtime/format-implementation.h

Lines changed: 86 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,84 @@ RT_API_ATTRS int FormatControl<CONTEXT>::GetIntField(
113113
return result;
114114
}
115115

116+
// Xn, TRn, TLn
117+
template <typename CONTEXT>
118+
static RT_API_ATTRS bool RelativeTabbing(CONTEXT &context, int n) {
119+
ConnectionState &connection{context.GetConnectionState()};
120+
if constexpr (std::is_same_v<CONTEXT,
121+
ExternalFormattedIoStatementState<Direction::Input>> ||
122+
std::is_same_v<CONTEXT,
123+
ExternalFormattedIoStatementState<Direction::Output>>) {
124+
if (n != 0 && connection.isUTF8) {
125+
const char *p{};
126+
if (n > 0) { // Xn or TRn
127+
// Skip 'n' multi-byte characters. If that's more than are in the
128+
// current record, that's valid -- the program can position past the
129+
// end and then reposition back with Tn or TLn.
130+
std::size_t bytesLeft{context.ViewBytesInRecord(p, true)};
131+
for (; n > 0 && bytesLeft && p; --n) {
132+
std::size_t byteCount{MeasureUTF8Bytes(*p)};
133+
if (byteCount > bytesLeft) {
134+
break;
135+
}
136+
context.HandleRelativePosition(byteCount);
137+
bytesLeft -= byteCount;
138+
// Don't call GotChar(byteCount), these don't count towards SIZE=
139+
p += byteCount;
140+
}
141+
} else { // n < 0: TLn
142+
n = -n;
143+
if (std::int64_t excess{connection.positionInRecord -
144+
connection.recordLength.value_or(connection.positionInRecord)};
145+
excess > 0) {
146+
// Have tabbed past the end of the record
147+
if (excess >= n) {
148+
context.HandleRelativePosition(-n);
149+
return true;
150+
}
151+
context.HandleRelativePosition(-excess);
152+
n -= excess;
153+
}
154+
std::size_t bytesLeft{context.ViewBytesInRecord(p, false)};
155+
// Go back 'n' multi-byte characters.
156+
for (; n > 0 && bytesLeft && p; --n) {
157+
std::size_t byteCount{MeasurePreviousUTF8Bytes(p, bytesLeft)};
158+
context.HandleRelativePosition(-byteCount);
159+
bytesLeft -= byteCount;
160+
p -= byteCount;
161+
}
162+
}
163+
}
164+
}
165+
if (connection.internalIoCharKind > 1) {
166+
n *= connection.internalIoCharKind;
167+
}
168+
context.HandleRelativePosition(n);
169+
return true;
170+
}
171+
172+
// Tn
173+
template <typename CONTEXT>
174+
static RT_API_ATTRS bool AbsoluteTabbing(CONTEXT &context, int n) {
175+
ConnectionState &connection{context.GetConnectionState()};
176+
n = n > 0 ? n - 1 : 0; // convert 1-based position to 0-based offset
177+
if constexpr (std::is_same_v<CONTEXT,
178+
ExternalFormattedIoStatementState<Direction::Input>> ||
179+
std::is_same_v<CONTEXT,
180+
ExternalFormattedIoStatementState<Direction::Output>>) {
181+
if (connection.isUTF8) {
182+
// Reset to the beginning of the record, then TR(n-1)
183+
connection.HandleAbsolutePosition(0);
184+
return RelativeTabbing(context, n);
185+
}
186+
}
187+
if (connection.internalIoCharKind > 1) {
188+
n *= connection.internalIoCharKind;
189+
}
190+
context.HandleAbsolutePosition(n);
191+
return true;
192+
}
193+
116194
template <typename CONTEXT>
117195
static RT_API_ATTRS void HandleControl(
118196
CONTEXT &context, char ch, char next, int n) {
@@ -169,12 +247,7 @@ static RT_API_ATTRS void HandleControl(
169247
}
170248
break;
171249
case 'X':
172-
if (!next) {
173-
ConnectionState &connection{context.GetConnectionState()};
174-
if (connection.internalIoCharKind > 1) {
175-
n *= connection.internalIoCharKind;
176-
}
177-
context.HandleRelativePosition(n);
250+
if (!next && RelativeTabbing(context, n)) {
178251
return;
179252
}
180253
break;
@@ -190,19 +263,13 @@ static RT_API_ATTRS void HandleControl(
190263
break;
191264
case 'T': {
192265
if (!next) { // Tn
193-
--n; // convert 1-based to 0-based
194-
}
195-
ConnectionState &connection{context.GetConnectionState()};
196-
if (connection.internalIoCharKind > 1) {
197-
n *= connection.internalIoCharKind;
198-
}
199-
if (!next) { // Tn
200-
context.HandleAbsolutePosition(n);
201-
return;
202-
}
203-
if (next == 'L' || next == 'R') { // TLn & TRn
204-
context.HandleRelativePosition(next == 'L' ? -n : n);
205-
return;
266+
if (AbsoluteTabbing(context, n)) {
267+
return;
268+
}
269+
} else if (next == 'R' || next == 'L') { // TRn / TLn
270+
if (RelativeTabbing(context, next == 'L' ? -n : n)) {
271+
return;
272+
}
206273
}
207274
} break;
208275
default:

flang/runtime/internal-unit.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ RT_API_ATTRS bool InternalDescriptorUnit<DIR>::Emit(
8080
template <Direction DIR>
8181
RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
8282
const char *&p, IoErrorHandler &handler) {
83+
p = nullptr;
8384
if constexpr (DIR == Direction::Output) {
8485
handler.Crash("InternalDescriptorUnit<Direction::Output>::"
8586
"GetNextInputBytes() called");
@@ -98,6 +99,28 @@ RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::GetNextInputBytes(
9899
}
99100
}
100101

102+
template <Direction DIR>
103+
RT_API_ATTRS std::size_t InternalDescriptorUnit<DIR>::ViewBytesInRecord(
104+
const char *&p, bool forward) const {
105+
p = nullptr;
106+
auto recl{recordLength.value_or(positionInRecord)};
107+
const char *record{CurrentRecord()};
108+
if (forward) {
109+
if (positionInRecord < recl) {
110+
if (record) {
111+
p = &record[positionInRecord];
112+
}
113+
return recl - positionInRecord;
114+
}
115+
} else {
116+
if (record && positionInRecord <= recl) {
117+
p = &record[positionInRecord];
118+
}
119+
return positionInRecord - leftTabLimit.value_or(0);
120+
}
121+
return 0;
122+
}
123+
101124
template <Direction DIR>
102125
RT_API_ATTRS bool InternalDescriptorUnit<DIR>::AdvanceRecord(
103126
IoErrorHandler &handler) {

flang/runtime/internal-unit.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ template <Direction DIR> class InternalDescriptorUnit : public ConnectionState {
3131

3232
RT_API_ATTRS bool Emit(const char *, std::size_t, IoErrorHandler &);
3333
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
34+
RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
3435
RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);
3536
RT_API_ATTRS void BackspaceRecord(IoErrorHandler &);
3637
RT_API_ATTRS std::int64_t InquirePos();

flang/runtime/io-stmt.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ std::size_t IoStatementBase::GetNextInputBytes(const char *&p) {
3232
return 0;
3333
}
3434

35+
std::size_t IoStatementBase::ViewBytesInRecord(
36+
const char *&p, bool forward) const {
37+
p = nullptr;
38+
return 0;
39+
}
40+
3541
bool IoStatementBase::AdvanceRecord(int) { return false; }
3642

3743
void IoStatementBase::BackspaceRecord() {}
@@ -105,6 +111,8 @@ std::size_t InternalIoStatementState<DIR>::GetNextInputBytes(const char *&p) {
105111
return unit_.GetNextInputBytes(p, *this);
106112
}
107113

114+
// InternalIoStatementState<DIR>::ViewBytesInRecord() not needed or defined
115+
108116
template <Direction DIR>
109117
bool InternalIoStatementState<DIR>::AdvanceRecord(int n) {
110118
while (n-- > 0) {
@@ -413,6 +421,12 @@ std::size_t ExternalIoStatementState<DIR>::GetNextInputBytes(const char *&p) {
413421
return unit().GetNextInputBytes(p, *this);
414422
}
415423

424+
template <Direction DIR>
425+
std::size_t ExternalIoStatementState<DIR>::ViewBytesInRecord(
426+
const char *&p, bool forward) const {
427+
return unit().ViewBytesInRecord(p, forward);
428+
}
429+
416430
template <Direction DIR>
417431
bool ExternalIoStatementState<DIR>::AdvanceRecord(int n) {
418432
while (n-- > 0) {

flang/runtime/io-stmt.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ class IoStatementState {
9393
const char *, std::size_t bytes, std::size_t elementBytes = 0);
9494
RT_API_ATTRS bool Receive(char *, std::size_t, std::size_t elementBytes = 0);
9595
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
96+
RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
9697
RT_API_ATTRS bool AdvanceRecord(int = 1);
9798
RT_API_ATTRS void BackspaceRecord();
9899
RT_API_ATTRS void HandleRelativePosition(std::int64_t byteOffset);
@@ -132,9 +133,9 @@ class IoStatementState {
132133
RT_API_ATTRS Fortran::common::optional<char32_t> GetCurrentChar(
133134
std::size_t &byteCount);
134135

135-
// The "remaining" arguments to CueUpInput(), SkipSpaces(), & NextInField()
136-
// are always in units of bytes, not characters; the distinction matters
137-
// for internal input from CHARACTER(KIND=2 and 4).
136+
// The result of CueUpInput() and the "remaining" arguments to SkipSpaces()
137+
// and NextInField() are always in units of bytes, not characters; the
138+
// distinction matters for internal input from CHARACTER(KIND=2 and 4).
138139

139140
// For fixed-width fields, return the number of remaining bytes.
140141
// Skip over leading blanks.
@@ -279,6 +280,7 @@ class IoStatementBase : public IoErrorHandler {
279280
RT_API_ATTRS bool Receive(
280281
char *, std::size_t bytes, std::size_t elementBytes = 0);
281282
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
283+
RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
282284
RT_API_ATTRS bool AdvanceRecord(int);
283285
RT_API_ATTRS void BackspaceRecord();
284286
RT_API_ATTRS void HandleRelativePosition(std::int64_t);
@@ -448,6 +450,7 @@ class ExternalIoStatementBase : public IoStatementBase {
448450
RT_API_ATTRS ExternalIoStatementBase(
449451
ExternalFileUnit &, const char *sourceFile = nullptr, int sourceLine = 0);
450452
RT_API_ATTRS ExternalFileUnit &unit() { return unit_; }
453+
RT_API_ATTRS const ExternalFileUnit &unit() const { return unit_; }
451454
RT_API_ATTRS MutableModes &mutableModes();
452455
RT_API_ATTRS ConnectionState &GetConnectionState();
453456
RT_API_ATTRS int asynchronousID() const { return asynchronousID_; }
@@ -473,6 +476,7 @@ class ExternalIoStatementState : public ExternalIoStatementBase,
473476
RT_API_ATTRS bool Emit(
474477
const char *, std::size_t bytes, std::size_t elementBytes = 0);
475478
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
479+
RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
476480
RT_API_ATTRS bool AdvanceRecord(int = 1);
477481
RT_API_ATTRS void BackspaceRecord();
478482
RT_API_ATTRS void HandleRelativePosition(std::int64_t);
@@ -539,6 +543,7 @@ class ChildIoStatementState : public IoStatementBase,
539543
RT_API_ATTRS bool Emit(
540544
const char *, std::size_t bytes, std::size_t elementBytes = 0);
541545
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&);
546+
RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
542547
RT_API_ATTRS void HandleRelativePosition(std::int64_t);
543548
RT_API_ATTRS void HandleAbsolutePosition(std::int64_t);
544549

flang/runtime/unit.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,24 @@ std::size_t ExternalFileUnit::GetNextInputBytes(
148148
return p ? length : 0;
149149
}
150150

151+
std::size_t ExternalFileUnit::ViewBytesInRecord(
152+
const char *&p, bool forward) const {
153+
p = nullptr;
154+
auto recl{recordLength.value_or(positionInRecord)};
155+
if (forward) {
156+
if (positionInRecord < recl) {
157+
p = Frame() + recordOffsetInFrame_ + positionInRecord;
158+
return recl - positionInRecord;
159+
}
160+
} else {
161+
if (positionInRecord <= recl) {
162+
p = Frame() + recordOffsetInFrame_ + positionInRecord;
163+
}
164+
return positionInRecord - leftTabLimit.value_or(0);
165+
}
166+
return 0;
167+
}
168+
151169
const char *ExternalFileUnit::FrameNextInput(
152170
IoErrorHandler &handler, std::size_t bytes) {
153171
RUNTIME_CHECK(handler, isUnformatted.has_value() && !*isUnformatted);

flang/runtime/unit.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ class ExternalFileUnit : public ConnectionState,
166166
RT_API_ATTRS bool Receive(
167167
char *, std::size_t, std::size_t elementBytes, IoErrorHandler &);
168168
RT_API_ATTRS std::size_t GetNextInputBytes(const char *&, IoErrorHandler &);
169+
RT_API_ATTRS std::size_t ViewBytesInRecord(const char *&, bool forward) const;
169170
RT_API_ATTRS bool BeginReadingRecord(IoErrorHandler &);
170171
RT_API_ATTRS void FinishReadingRecord(IoErrorHandler &);
171172
RT_API_ATTRS bool AdvanceRecord(IoErrorHandler &);

flang/runtime/utf.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,17 @@ RT_OFFLOAD_VAR_GROUP_END
4444
#endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
4545

4646
RT_OFFLOAD_API_GROUP_BEGIN
47+
48+
// Measures the UTF-8 character that ends immediately before 'end' by
// scanning backwards over continuation bytes (those of the form 10xxxxxx)
// until a byte of any other form is found.  At most 'limit' bytes before
// 'end' are examined.  Returns the number of bytes stepped back, which is
// 'limit' itself when every examined byte is a continuation byte (and
// therefore 0 when 'limit' is 0).
std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) {
  // Walk a pointer backwards instead of indexing with a negated std::size_t:
  // '-n' on an unsigned value wraps to a huge offset, so 'end[-n]' reaches
  // the intended byte only by way of address wraparound.
  const char *p{end};
  for (std::size_t n{1}; n <= limit; ++n) {
    --p;
    if ((static_cast<unsigned char>(*p) & 0xc0) != 0x80) {
      return n; // found the first byte of the character
    }
  }
  return limit;
}
57+
4758
// Non-minimal encodings are accepted.
4859
Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
4960
const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};

flang/runtime/utf.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ static inline RT_API_ATTRS std::size_t MeasureUTF8Bytes(char first) {
5858
return UTF8FirstByteTable[static_cast<std::uint8_t>(first)];
5959
}
6060

61+
RT_API_ATTRS std::size_t MeasurePreviousUTF8Bytes(
62+
const char *end, std::size_t limit);
63+
6164
// Ensure that all bytes are present in sequence in the input buffer
6265
// before calling; use MeasureUTF8Bytes(first byte) to count them.
6366
RT_API_ATTRS Fortran::common::optional<char32_t> DecodeUTF8(const char *);

0 commit comments

Comments
 (0)