/**
 * Character classification helpers: detection of characters that belong to the
 * Unicode blocks used by Chinese text, and detection of printable ASCII.
 *
 * @author Robert Chen
 */
public class CharUtil
{
    /** Lowest printable ASCII code (space). */
    private static final char PRINTABLE_ASCII_MIN = 32;

    /** Highest printable ASCII code (tilde). */
    private static final char PRINTABLE_ASCII_MAX = 126;

    /**
     * Tells whether a single UTF-16 code unit lies in one of the Unicode blocks
     * this class accepts as Chinese text. The accepted blocks cover CJK
     * ideographs and also CJK symbols/punctuation, half-/full-width forms and
     * general punctuation, so the result is {@code true} for more than
     * ideographs alone.
     *
     * <p>A {@code char} cannot represent a supplementary character: both halves
     * of a surrogate pair report a surrogate block and therefore yield
     * {@code false}. Use {@link #IsChinese(int)} with a full code point to test
     * characters from CJK Unified Ideographs Extension B.
     *
     * <p>Ref: http://www.micmiu.com/lang/java/java-check-chinese/
     *
     * @param ch the UTF-16 code unit to classify
     * @return {@code true} if {@code ch} is in one of the accepted blocks
     */
    public static boolean IsChinese( char ch ) {
        // Widening to int selects the code point overload; for a BMP value the
        // block lookup is the same as the char-based lookup.
        return IsChinese( (int) ch );
    }

    /**
     * Code point variant of {@link #IsChinese(char)}. Accepts the same set of
     * Unicode blocks, and can additionally recognise supplementary characters
     * such as those in CJK Unified Ideographs Extension B.
     *
     * @param codePoint the Unicode code point to classify
     * @return {@code true} if {@code codePoint} is in one of the accepted
     *         blocks; {@code false} otherwise, including when
     *         {@code codePoint} is not a valid Unicode code point
     */
    public static boolean IsChinese( int codePoint ) {
        // UnicodeBlock.of(int) throws IllegalArgumentException for an invalid
        // code point; a classification predicate should just answer "no".
        if ( !Character.isValidCodePoint( codePoint ) ) {
            return false;
        }
        // of() returns null for code points outside every defined block; the
        // identity comparisons below are then all false.
        Character.UnicodeBlock ub = Character.UnicodeBlock.of( codePoint );
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
            || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
            || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
            || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
            || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
            || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
            || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION;
    }

    /**
     * Tells whether a character is a printable ASCII character, i.e. its code
     * is in the inclusive range 32 (space) to 126 (tilde).
     *
     * @param ch the character to test
     * @return {@code true} if {@code 32 <= ch <= 126}
     */
    public static boolean IsPrintableAsciiChar( char ch ) {
        return PRINTABLE_ASCII_MIN <= ch && ch <= PRINTABLE_ASCII_MAX;
    }

    /**
     * Demo: prints the Unicode block of every code below 128, an empty line,
     * and then the block of code point 950.
     *
     * @param args unused
     */
    public static void main( String[] args ) {
        // Ref: http://tw.gitbook.net/java/lang/character.unicodeblock_of.html
        // 32~126: Printable characters
        for ( char ch = 0; ch < 128; ++ch ) {
            System.out.println( Character.UnicodeBlock.of( ch ) ); // Every line is "BASIC_LATIN".
        }
        System.out.println();
        // Ref: https://www.yiibai.com/java/lang/character.unicodeblock_of_codepoint.html#article-start
        System.out.println( Character.UnicodeBlock.of( 950 ) ); // "GREEK"
    }
}
使用:
// Usage example: collect every character of utf8Text that is neither printable
// ASCII nor accepted by CharUtil.IsChinese.
String utf8Text = "" ;
int utf8TextLen = utf8Text.length() ;
// StringBuilder instead of StringBuffer: nothing in this snippet shares the
// buffer between threads, so StringBuffer's synchronization is not needed.
StringBuilder illegalCharSB = new StringBuilder( utf8TextLen / 2 );
for ( int i = 0; i < utf8TextLen; ++i ) {
    char testChar = utf8Text.charAt( i );
    // Legal characters are skipped; only the rejected ones are recorded.
    if ( CharUtil.IsPrintableAsciiChar( testChar ) || CharUtil.IsChinese( testChar ) ) {
        continue;
    }
    // Each rejected character is appended as: space, quote, character, quote.
    illegalCharSB.append( " '" ).append( testChar ).append( "'" );
}
還參考了:
● Regex Tutorial - Unicode Characters and Properties
● Regex Tutorial - Literal Characters and Special Characters
● Unicode等價性 - 維基百科,自由的百科全書
● 对字符串进行验证之前先进行规范化 - 我的技术旅程 - ITeye博客
● unicode - Detect Chinese character in java - Stack Overflow
● Java 中文字符判断 中文标点符号判断 - Tong Zeng - 博客园
● 使用Java代码过滤掉乱码字符 - CSDN博客
