393. UTF-8 Validation

  • 34.9%

https://leetcode.com/problems/utf-8-validation/?tab=Description

A character in UTF8 can be from 1 to 4 bytes long, subjected to the following rules:

  1. For 1-byte character, the first bit is a 0, followed by its unicode code.
  2. For n-bytes character, the first n-bits are all one’s, the n+1 bit is 0, followed by n-1 bytes with most significant 2 bits being 10.
1
2
3
4
5
6
7
8
9
This is how the UTF-8 encoding would work:

Char. number range | UTF-8 octet sequence
(hexadecimal) | (binary)
--------------------+---------------------------------------------
0000 0000-0000 007F | 0xxxxxxx
0000 0080-0000 07FF | 110xxxxx 10xxxxxx
0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

Given an array of integers representing the data, return whether it is a valid utf-8 encoding.

Note:

The input is an array of integers. Only the least significant 8 bits of each integer is used to store the data. This means each integer represents only 1 byte of data.

1
2
3
4
5
6
Example 1:

data = [197, 130, 1], which represents the octet sequence: 11000101 10000010 00000001.

Return true.
It is a valid utf-8 encoding for a 2-bytes character followed by a 1-byte character.
1
2
3
4
5
6
7
8
Example 2:

data = [235, 140, 4], which represented the octet sequence: 11101011 10001100 00000100.

Return false.
The first 3 bits are all one's and the 4th bit is 0 means it is a 3-bytes character.
The next byte is a continuation byte which starts with 10 and that's correct.
But the second continuation byte does not start with 10, so it is invalid.

方法一:

https://discuss.leetcode.com/topic/57195/concise-c-implementation

Concise C++ implementation

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
class Solution {
public:
bool validUtf8(vector<int>& data) {
int count = 0;
for (auto c : data) {
if (count == 0) {
if ((c >> 5) == 0b110) count = 1;
else if ((c >> 4) == 0b1110) count = 2;
else if ((c >> 3) == 0b11110) count = 3;
else if ((c >> 7)) return false;
} else {
if ((c >> 6) != 0b10) return false;
count--;
}
}
return count == 0;
}
};

补充资料:

C++ 的二进制语法与语义

转载请注明出处:http://www.cnblogs.com/Martinium/p/binary_literal.html

二进制的语法

  C/C++ 默认数字使用十进制,八进制使用前缀 0, 十六进制使用前缀 0x 或 0X,二进制常数的提议被否决(引用 C 语言程序原理国际标准 的 6.4.4.1 章节字段 “A proposal to add binary constants was rejected due to lack of precedent and insufficient utility.”),一直没有二进制的表示方法,GCC 使用 0b/0B 前缀作为扩展,其实很多编译器都有这个扩展的,只是标准委员会一直没采纳。直到 C++14 才引进,可谓姗姗来迟啊。这种二进制语义 Java 7 也早些时候引入,Python 2.6 以及后来的 3 也引入,Python 为了避免十进制与八进制的混淆,一律用字母前缀,不分大小写,0b 为二进制,0o 为八进制,ox 为十六进制。Python2 升到 Python3 时果断丢弃了原来的八进制语义(也就是在 C/C++/Java 里面有 0 前缀的八进制的语义)。

表示方法

  1. 下面每行语句表达语义相同,只是选用不同的进制数表示。
1
2
3
4
int i = 0b101010;  // binary
int i = 052; // octal
int i = 42; // decimal
int i = 0x2a; // hexadecimal

方法二:

根据题意,比较直白的写法

我的代码实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class Solution {
public:
bool validUtf8(vector<int>& data) {
if(data.empty()) return false;
// if(data.size()==1) return !(data[0]&(1<<7));
int i = 0, n = data.size(), cnt, flag;
while(i<n){
flag = 1<<7;
// (flag&data[i])==0 而不是 flag & data[i] == 0
// 位运算优先级低于关系运算符
if((flag & data[i]) == 0){
i++;
}else{
cnt = 0;
while(flag>2 && (data[i] & (flag>>1))){
cnt++;
flag = flag>>1;
}
// 考虑[145] = [10010001]的情况
// 注意不能超过4长度
if(cnt==0 || cnt>3)
return false;
for(int j=0; j<cnt; j++){
if(i+1<n && (data[i+1] & (1<<7)) && (data[i+1] & (1<<6))==0){
i++;
continue;
}
else
return false;
}
i++;
}
}
return true;
}
};

https://discuss.leetcode.com/topic/58159/feeling-like-an-english-reading-comprehension-problem

Feeling like an English reading comprehension problem

Finally I see it is to judge a UTF-8 char sequence, while the rules described are for a single char.

I feel more likely working on an English reading comprehension problem rather than algorithm.
Sigh.


https://discuss.leetcode.com/topic/57192/one-pass-simple-solution

one pass simple solution

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
public bool ValidUtf8(int[] data) {
int bitCount = 0;

foreach(int n in data){

if(n >= 192){
if(bitCount != 0)
return false;
else if(n >= 240)
bitCount = 3;
else if(n >= 224)
bitCount = 2;
else
bitCount = 1;
}else if(n >= 128){
bitCount--;
if(bitCount < 0)
return false;
}else if(bitCount > 0){
return false;
}
}

return bitCount == 0;
}

https://discuss.leetcode.com/topic/58338/bit-manipulation-java-6ms

Bit Manipulation, Java, 6ms

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
public boolean validUtf8(int[] data) {
if(data==null || data.length==0) return false;
boolean isValid = true;
for(int i=0;i<data.length;i++) {
if(data[i]>255) return false; // 1 after 8th digit, 100000000
int numberOfBytes = 0;
if((data[i] & 128) == 0) { // 0xxxxxxx, 1 byte, 128(10000000)
numberOfBytes = 1;
} else if((data[i] & 224) == 192) { // 110xxxxx, 2 bytes, 224(11100000), 192(11000000)
numberOfBytes = 2;
} else if((data[i] & 240) == 224) { // 1110xxxx, 3 bytes, 240(11110000), 224(11100000)
numberOfBytes = 3;
} else if((data[i] & 248) == 240) { // 11110xxx, 4 bytes, 248(11111000), 240(11110000)
numberOfBytes = 4;
} else {
return false;
}
for(int j=1;j<numberOfBytes;j++) { // check that the next n bytes start with 10xxxxxx
if(i+j>=data.length) return false;
if((data[i+j] & 192) != 128) return false; // 192(11000000), 128(10000000)
}
i=i+numberOfBytes-1;
}
return isValid;
}

https://discuss.leetcode.com/topic/57178/o-n-solution-using-java

O(n) solution using Java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
public class Solution {
public boolean validUtf8(int[] data) {
int n = data.length;
if (n == 0) return true;
int skip = 0b10000000;
int check = 0;
for (int i = 0; i < data.length; i++) {
if (check > 0) {
if ((data[i] & skip) == skip) check--;
else return false;
} else {
check = getOneBitCountFromHead(data[i]);
if (check < 0) return false;
}
}
return check == 0;
}
private int getOneBitCountFromHead(int num) {
if ((num & 0b11110000) == 0b11110000) return 3;
if ((num & 0b11100000) == 0b11100000) return 2;
if ((num & 0b11000000) == 0b11000000) return 1;
if ((num & 0b10000000) == 0b10000000) return -1; //error
return 0;
}
}

https://discuss.leetcode.com/topic/57964/short-n-clean-12-lines-python-solution

Short’n’Clean 12-lines Python solution

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def check(nums, start, size):
for i in range(start + 1, start + size + 1):
if i >= len(nums) or (nums[i] >> 6) != 0b10: return False
return True

class Solution(object):
def validUtf8(self, nums, start=0):
while start < len(nums):
first = nums[start]
if (first >> 3) == 0b11110 and check(nums, start, 3): start += 4
elif (first >> 4) == 0b1110 and check(nums, start, 2): start += 3
elif (first >> 5) == 0b110 and check(nums, start, 1): start += 2
elif (first >> 7) == 0: start += 1
else: return False
return True

# 45 / 45 test cases passed.
# Status: Accepted
# Runtime: 89 ms

https://discuss.leetcode.com/topic/57094/python-o-n-scan

Python O(n) scan

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
class Solution(object):
def validUtf8(self, data):
"""
:type data: List[int]
:rtype: bool
"""
if len(data) == 0:
return True
i = 0
while i < len(data):
if data[i] < 128:
i += 1
elif data[i] >= 192 and data[i] < 224 and len(data)-i>=2:
if data[i+1] >= 128 and data[i+1] < 192:
i += 2
else:
return False
elif data[i] >= 224 and data[i] < 240 and len(data)-i>=3:
if data[i+1] >= 128 and data[i+1] < 192 and data[i+2] >= 128 and data[i+2] < 192:
i += 3
else:
return False
elif data[i] >= 240 and data[i] < 248 and len(data)-i>=4:
if data[i+1] >= 128 and data[i+1] < 192 and data[i+2] >= 128 and data[i+2] < 192 and data[i+3] >= 128 and data[i+3] < 192:
i += 4
else:
return False
else:
return False
return True
谢谢你,可爱的朋友。