001//////////////////////////////////////////////////////////////////////////////// 002// checkstyle: Checks Java source code for adherence to a set of rules. 003// Copyright (C) 2001-2016 the original author or authors. 004// 005// This library is free software; you can redistribute it and/or 006// modify it under the terms of the GNU Lesser General Public 007// License as published by the Free Software Foundation; either 008// version 2.1 of the License, or (at your option) any later version. 009// 010// This library is distributed in the hope that it will be useful, 011// but WITHOUT ANY WARRANTY; without even the implied warranty of 012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 013// Lesser General Public License for more details. 014// 015// You should have received a copy of the GNU Lesser General Public 016// License along with this library; if not, write to the Free Software 017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 018//////////////////////////////////////////////////////////////////////////////// 019 020package com.puppycrawl.tools.checkstyle.checks; 021 022import java.util.List; 023import java.util.Map; 024import java.util.regex.Matcher; 025import java.util.regex.Pattern; 026 027import com.puppycrawl.tools.checkstyle.api.AbstractCheck; 028import com.puppycrawl.tools.checkstyle.api.DetailAST; 029import com.puppycrawl.tools.checkstyle.api.TextBlock; 030import com.puppycrawl.tools.checkstyle.api.TokenTypes; 031 032/** 033 * <p> 034 * Restrict using <a href = 035 * "http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3"> 036 * Unicode escapes</a> (e.g. \u221e). 037 * It is possible to allow using escapes for 038 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters"> 039 * non-printable(control) characters</a>. 040 * Also, this check can be configured to allow using escapes 041 * if trail comment is present. By the option it is possible to 042 * allow using escapes if literal contains only them. By the option it 043 * is possible to allow using escapes for space literals. 044 * </p> 045 * <p> 046 * Examples of using Unicode:</p> 047 * <pre> 048 * String unitAbbrev = "μs"; //Best: perfectly clear even without a comment. 049 * String unitAbbrev = "\u03bcs"; //Poor: the reader has no idea what this is. 050 * </pre> 051 * <p> 052 * An example of how to configure the check is: 053 * </p> 054 * <pre> 055 * <module name="AvoidEscapedUnicodeCharacters"/> 056 * </pre> 057 * <p> 058 * An example of non-printable(control) characters. 059 * </p> 060 * <pre> 061 * return '\ufeff' + content; // byte order mark 062 * </pre> 063 * <p> 064 * An example of how to configure the check to allow using escapes 065 * for non-printable(control) characters: 066 * </p> 067 * <pre> 068 * <module name="AvoidEscapedUnicodeCharacters"> 069 * <property name="allowEscapesForControlCharacters" value="true"/> 070 * </module> 071 * </pre> 072 * <p> 073 * Example of using escapes with trail comment: 074 * </p> 075 * <pre> 076 * String unitAbbrev = "\u03bcs"; // Greek letter mu, "s" 077 * </pre> 078 * <p>An example of how to configure the check to allow using escapes 079 * if trail comment is present: 080 * </p> 081 * <pre> 082 * <module name="AvoidEscapedUnicodeCharacters"> 083 * <property name="allowByTailComment" value="true"/> 084 * </module> 085 * </pre> 086 * <p>Example of using escapes if literal contains only them: 087 * </p> 088 * <pre> 089 * String unitAbbrev = "\u03bc\u03bc\u03bc"; 090 * </pre> 091 * <p>An example of how to configure the check to allow escapes 092 * if literal contains only them: 093 * </p> 094 * <pre> 095 * <module name="AvoidEscapedUnicodeCharacters"> 096 * <property name="allowIfAllCharactersEscaped" value="true"/> 097 * </module> 098 * </pre> 099 * <p>An example of how to configure the check to allow non-printable escapes: 100 * </p> 101 * <pre> 102 * <module name="AvoidEscapedUnicodeCharacters"> 103 * <property name="allowNonPrintableEscapes" value="true"/> 104 * </module> 105 * </pre> 106 * 107 * @author maxvetrenko 108 * 109 */ 110public class AvoidEscapedUnicodeCharactersCheck 111 extends AbstractCheck { 112 /** 113 * A key is pointing to the warning message text in "messages.properties" 114 * file. 115 */ 116 public static final String MSG_KEY = "forbid.escaped.unicode.char"; 117 118 /** Regular expression for Unicode chars. */ 119 private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}"); 120 121 /** Regular expression Unicode control characters. */ 122 private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)" 123 + "(00[0-1][0-1A-Fa-f]|00[8-9][0-9A-Fa-f]|034(f|F)|070(f|F)" 124 + "|180(e|E)|200[b-fB-F]|202[b-eB-E]|206[0-4a-fA-F]" 125 + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})"); 126 127 /** Regular expression for all escaped chars. */ 128 private static final Pattern ALL_ESCAPED_CHARS = 129 Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}" 130 + "||\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$"); 131 132 /** Regular expression for non-printable unicode chars. */ 133 private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028" 134 + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)" 135 + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)" 136 + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)" 137 + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069" 138 + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9" 139 + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604" 140 + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)" 141 + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)" 142 + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)" 143 + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00" 144 + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9" 145 + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}" 146 + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000" 147 + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)" 148 + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)" 149 + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006" 150 + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028" 151 + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025" 152 + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61"); 153 154 /** Cpp style comments. */ 155 private Map<Integer, TextBlock> singlelineComments; 156 /** C style comments. */ 157 private Map<Integer, List<TextBlock>> blockComments; 158 159 /** Allow use escapes for non-printable(control) characters. */ 160 private boolean allowEscapesForControlCharacters; 161 162 /** Allow use escapes if trail comment is present. */ 163 private boolean allowByTailComment; 164 165 /** Allow if all characters in literal are escaped. */ 166 private boolean allowIfAllCharactersEscaped; 167 168 /** Allow escapes for space literals. */ 169 private boolean allowNonPrintableEscapes; 170 171 /** 172 * Set allowIfAllCharactersEscaped. 173 * @param allow user's value. 174 */ 175 public final void setAllowEscapesForControlCharacters(boolean allow) { 176 allowEscapesForControlCharacters = allow; 177 } 178 179 /** 180 * Set allowByTailComment. 181 * @param allow user's value. 182 */ 183 public final void setAllowByTailComment(boolean allow) { 184 allowByTailComment = allow; 185 } 186 187 /** 188 * Set allowIfAllCharactersEscaped. 189 * @param allow user's value. 190 */ 191 public final void setAllowIfAllCharactersEscaped(boolean allow) { 192 allowIfAllCharactersEscaped = allow; 193 } 194 195 /** 196 * Set allowSpaceEscapes. 197 * @param allow user's value. 198 */ 199 public final void setAllowNonPrintableEscapes(boolean allow) { 200 allowNonPrintableEscapes = allow; 201 } 202 203 @Override 204 public int[] getDefaultTokens() { 205 return getAcceptableTokens(); 206 } 207 208 @Override 209 public int[] getAcceptableTokens() { 210 return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL}; 211 } 212 213 @Override 214 public int[] getRequiredTokens() { 215 return getAcceptableTokens(); 216 } 217 218 @Override 219 public void beginTree(DetailAST rootAST) { 220 singlelineComments = getFileContents().getCppComments(); 221 blockComments = getFileContents().getCComments(); 222 } 223 224 @Override 225 public void visitToken(DetailAST ast) { 226 227 final String literal = ast.getText(); 228 229 if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast) 230 || isAllCharactersEscaped(literal) 231 || allowEscapesForControlCharacters 232 && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL) 233 || allowNonPrintableEscapes 234 && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) { 235 log(ast.getLineNo(), MSG_KEY); 236 } 237 } 238 239 /** 240 * Checks if literal has Unicode chars. 241 * @param literal String literal. 242 * @return true if literal has Unicode chars. 243 */ 244 private static boolean hasUnicodeChar(String literal) { 245 return UNICODE_REGEXP.matcher(literal).find(); 246 } 247 248 /** 249 * Check if String literal contains Unicode control chars. 250 * @param literal String literal. 251 * @param pattern RegExp for valid characters. 252 * @return true, if String literal contains Unicode control chars. 253 */ 254 private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) { 255 final int unicodeMatchesCounter = 256 countMatches(UNICODE_REGEXP, literal); 257 final int unicodeValidMatchesCounter = 258 countMatches(pattern, literal); 259 return unicodeMatchesCounter - unicodeValidMatchesCounter == 0; 260 } 261 262 /** 263 * Check if trail comment is present after ast token. 264 * @param ast current token. 265 * @return true if trail comment is present after ast token. 266 */ 267 private boolean hasTrailComment(DetailAST ast) { 268 boolean result = false; 269 final int lineNo = ast.getLineNo(); 270 if (singlelineComments.containsKey(lineNo)) { 271 result = true; 272 } 273 else { 274 final String line = getLines()[lineNo - 1]; 275 final List<TextBlock> commentList = blockComments.get(lineNo); 276 if (commentList != null) { 277 final TextBlock comment = commentList.get(commentList.size() - 1); 278 result = isTrailingCComent(comment, line); 279 } 280 } 281 return result; 282 } 283 284 /** 285 * Whether the C style comment is trailing. 286 * @param comment the comment to check. 287 * @param line the line where the comment starts. 288 * @return true if the comment is trailing. 289 */ 290 private static boolean isTrailingCComent(TextBlock comment, String line) { 291 return comment.getText().length != 1 292 || line.substring(comment.getEndColNo() + 1).trim().isEmpty(); 293 } 294 295 /** 296 * Count regexp matches into String literal. 297 * @param pattern pattern. 298 * @param target String literal. 299 * @return count of regexp matches. 300 */ 301 private static int countMatches(Pattern pattern, String target) { 302 int matcherCounter = 0; 303 final Matcher matcher = pattern.matcher(target); 304 while (matcher.find()) { 305 matcherCounter++; 306 } 307 return matcherCounter; 308 } 309 310 /** 311 * Checks if all characters in String literal is escaped. 312 * @param literal current literal. 313 * @return true if all characters in String literal is escaped. 314 */ 315 private boolean isAllCharactersEscaped(String literal) { 316 return allowIfAllCharactersEscaped 317 && ALL_ESCAPED_CHARS.matcher(literal.substring(1, 318 literal.length() - 1)).find(); 319 } 320}