close

 

/**
 * Character classification helpers based on Unicode blocks and ASCII ranges.
 *
 * @author Robert Chen
 */
public class CharUtil
{
    /**
     * Tells whether a UTF-16 code unit belongs to one of the Unicode blocks
     * this utility treats as "Chinese".
     *
     * <p>Besides the CJK ideograph blocks, the accepted set deliberately
     * includes CJK Symbols and Punctuation, Halfwidth and Fullwidth Forms and
     * General Punctuation, so punctuation commonly found in Chinese text is
     * accepted as well.
     *
     * <p>A single {@code char} cannot represent a supplementary character
     * (for example anything in CJK Unified Ideographs Extension B); a lone
     * surrogate is reported as not Chinese. Use {@link #IsChinese(int)} with a
     * full code point for those.
     *
     * <p>Ref: http://www.micmiu.com/lang/java/java-check-chinese/
     *
     * @param ch the UTF-16 code unit to classify
     * @return {@code true} if {@code ch} lies in one of the accepted blocks
     */
    public static boolean IsChinese( char ch ) {
        // Widening to int selects the code point overload; for a char this
        // yields the same block as Character.UnicodeBlock.of(char).
        return IsChinese( (int) ch );
    }

    /**
     * Code point variant of {@link #IsChinese(char)}; this is the only way to
     * classify supplementary characters such as those in CJK Unified
     * Ideographs Extension B (U+20000 and above).
     *
     * @param codePoint the Unicode code point to classify
     * @return {@code true} if {@code codePoint} lies in one of the accepted blocks
     * @throws IllegalArgumentException if {@code codePoint} is not a valid
     *         Unicode code point
     */
    public static boolean IsChinese( int codePoint ) {
        // of() returns null for code points outside every known block; the
        // reference comparisons below then simply evaluate to false.
        Character.UnicodeBlock ub = Character.UnicodeBlock.of( codePoint );
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
            || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
            || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
            || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
            || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
            || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
            || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION;
    }

    /**
     * Tells whether a character is a printable ASCII character, i.e. its
     * value is in the inclusive range 32 (space) to 126 (tilde).
     *
     * @param ch the character to test
     * @return {@code true} if {@code 32 <= ch <= 126}
     */
    public static boolean IsPrintableAsciiChar( char ch ) {
        return 32 <= ch && ch <= 126;
    }


    /**
     * Small demo: prints the Unicode block of every ASCII code unit, a blank
     * line, and then the block of code point 950.
     *
     * @param args ignored
     */
    public static void main( String[] args ) {
        // Ref: http://tw.gitbook.net/java/lang/character.unicodeblock_of.html

        // 32~126: Printable characters
        for ( char ch = 0; ch < 128 ; ++ch ) {
            System.out.println( Character.UnicodeBlock.of( ch ) );  // All is "BASIC_LATIN".
        }
        System.out.println();

        // Ref: https://www.yiibai.com/java/lang/character.unicodeblock_of_codepoint.html#article-start
        System.out.println( Character.UnicodeBlock.of(950) );  // "GREEK"
    }
}

 

使用:

    String utf8Text = "" ;

    // Collect every character that is neither printable ASCII nor accepted
    // by CharUtil.IsChinese, each one wrapped in single quotes.
    int utf8TextLen = utf8Text.length() ;
    StringBuffer illegalCharSB = new StringBuffer( utf8TextLen / 2 );
    for ( int idx = 0; idx < utf8TextLen; idx++ ) {
        char candidate = utf8Text.charAt( idx );
        boolean legal = CharUtil.IsPrintableAsciiChar( candidate )
                     || CharUtil.IsChinese( candidate );
        if ( !legal ) {
            illegalCharSB.append( " '" ).append( candidate ).append( "'" );
        }
    }

 

還參考了:
Regex Tutorial - Unicode Characters and Properties
Regex Tutorial - Literal Characters and Special Characters
Unicode等價性 - 維基百科,自由的百科全書
对字符串进行验证之前先进行规范化 - 我的技术旅程 - ITeye博客
unicode - Detect Chinese character in java - Stack Overflow
Java 中文字符判断 中文标点符号判断 - Tong Zeng - 博客园
使用Java代码过滤掉乱码字符 - CSDN博客
 

 

arrow
arrow

    Robert 發表在 痞客邦 留言(0) 人氣()