View Javadoc

1   /*
2    * This file is part of Pease Plate Template Engine.
3    * 
4    * Pease Plate Template Engine is free software: you can redistribute
5    * it and/or modify it under the terms of the GNU Lesser General 
6    * Public License as published by the Free Software Foundation, 
7    * either version 3 of the License, or any later version.
8    * 
9    * Pease Plate Template Engine is distributed in the hope that it 
10   * will be useful, but WITHOUT ANY WARRANTY; without even the implied
11   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12   * See the GNU Lesser General Public License for more details.
13   * 
14   * You should have received a copy of the GNU Lesser General Public 
15   * License along with Pease Plate Template Engine. If not, see 
16   * <http://www.gnu.org/licenses/>.
17   * 
18   * Copyright (c) 2008 Manfred HANTSCHEL
19   */
20  package org.peaseplate.internal.lang;
21  
22  import org.peaseplate.locator.TemplateLocator;
23  
24  public class Tokenizer {
25  
26  	private static final String KEYWORD_TRUE = "true";
27  	private static final String KEYWORD_FALSE = "false";
28  	private static final String KEYWORD_THIS = "$this";
29  	private static final String KEYWORD_POP = "$pop";
30  	private static final String KEYWORD_NULL = "null";
31  
32  	public enum Type {
33  		NONE,
34  		IDENTIFIER,
35  		VARIABLE,
36  		NUMERIC,
37  		STRING,
38  		KEY,
39  		KEYWORD_TRUE,
40  		KEYWORD_FALSE,
41  		KEYWORD_NULL,
42  		KEYWORD_THIS,
43  		KEYWORD_POP,
44  		DELEGATOR,
45  		SEPARATOR, 
46  		UNARY_OPERATOR,
47  		MULTIPLICATIVE_OPERATOR,
48  		ADDITIVE_OPERATOR,
49  		SHIFT_OPERATOR,
50  		RELATIONAL_OPERATOR,
51  		EQUALITY_OPERATOR,
52  		BITWISE_OPERATOR,
53  		CONDITIONAL_AND_OPERATOR,
54  		CONDITIONAL_OR_OPERATOR,
55  		INLINE_CONDIDION,
56  		INLINE_SEPARATOR,
57  		ASSIGNMENT,
58  		PARENTHESIS_OPEN,
59  		PARENTHESIS_CLOSE,
60  		QUERY_OPEN,
61  		QUERY_CLOSE,
62  		TRANSFORMER_CALL,
63  		ERROR
64  	}
65  	
66  	private static final char EOC = 0;
67  	
68  	private final char[] code;
69  	private final int endOffset;
70  	
71  	private int offset = 0;
72  	private int line = 1;
73  	private int column = 1;
74  	private int escapeCount = 0;
75  
76  	private char lookAhead = EOC;
77  	private char c = EOC;
78  	
79  	private Type tokenType = Type.NONE;
80  	private Object tokenValue = null;
81  	private int tokenLine = 0;
82  	private int tokenColumn = 0;
83  	
84  	public Tokenizer(TemplateLocator locator, int line, int column, char[] code, int offset, int length) {
85  		super();
86  		
87  		this.line = line;
88  		this.column = column;
89  		
90  		this.code = code;
91  		this.offset = offset;
92  		this.endOffset = offset + length;
93  		escapeCount = 0;
94  		
95  		lookAhead = (offset+1 < endOffset) ? code[offset+1] : EOC;
96  		c = (offset < endOffset) ? code[offset] : EOC;
97  	}
98  	
99  	public Type getTokenType() {
100     	return tokenType;
101     }
102 
103 	public Object getTokenValue() {
104     	return tokenValue;
105     }
106 
107 	public int getTokenLine() {
108 		return tokenLine;
109 	}
110 
111 	public int getTokenColumn() {
112 		return tokenColumn;
113 	}
114 
115 	public Type readToken() throws TokenizerException {
116 		nextWhileWhitespace();
117 		
118 		tokenLine = line;
119 		tokenColumn = column;
120 		
121 		if (c == EOC) {
122 			tokenType = Type.NONE;
123 			tokenValue = "end of code";
124 			
125 			return tokenType;
126 		}
127 		else if (c == '$') {
128 			// read a name
129 			int start = offset;
130 			
131 			while (true) {
132 				next();
133 				
134 				if ((!isLetter(c)) && (!isDigit(c)) && (c != '_')) {
135 					tokenValue = new String(code, start, offset-start);
136 					
137 					if (KEYWORD_THIS.equals(tokenValue))
138 						tokenType = Type.KEYWORD_THIS;
139 					else if (KEYWORD_POP.equals(tokenValue))
140 						tokenType = Type.KEYWORD_POP;
141 					else
142 						tokenType = Type.VARIABLE;
143 					
144 					return tokenType;
145 				}
146 			}
147 		}
148 		else if ((isLetter(c)) || (c == '_')) {
149 			// read a name
150 			int start = offset;
151 			
152 			while (true) {
153 				next();
154 				
155 				if ((!isLetter(c)) && (!isDigit(c)) && (c != '_')) {
156 					tokenValue = new String(code, start, offset-start);
157 					
158 					if (KEYWORD_TRUE.equals(tokenValue))
159 						tokenType = Type.KEYWORD_TRUE;
160 					else if (KEYWORD_FALSE.equals(tokenValue))
161 						tokenType = Type.KEYWORD_FALSE;
162 					else if (KEYWORD_NULL.equals(tokenValue))
163 						tokenType = Type.KEYWORD_NULL;
164 					else
165 						tokenType = Type.IDENTIFIER;
166 					
167 					return tokenType;
168 				}
169 			}
170 		}  
171 		// numbers: 42, +42, -42, 3.14, -3.14, 0.1, .1, +.1, -.1, 1.1E14, -1.1E-14  
172 		else if ((isDigit(c)) || (c == '#')) {
173 			// read a number
174 			int start = offset;
175 			int startLine = line;
176 			int startColumn = column;
177 			boolean floatingpoint = false;
178 			
179 			while (true) {
180 				next();
181 				
182 				if ((c == '.') || (c == 'e') || (c == 'E')) 
183 					floatingpoint = true;
184 				
185 				if ((!isLetter(c)) && (!isDigit(c)) && (c != '.')) {
186 					String s = new String(code, start, offset-start);
187 
188 					if ((s.endsWith("d")) || (s.endsWith("D"))) {
189 						try {
190 							tokenValue = Double.valueOf(s.substring(0, s.length()-1));
191 						}
192 						catch (NumberFormatException e) {
193 							throw new TokenizerException(startLine, startColumn, "Could not parse double tokenValue \"" + s + "\"", e);
194 						}
195 					}
196 					else if ((s.endsWith("f")) || (s.endsWith("F"))) {
197 						try {
198 							tokenValue = Float.valueOf(s.substring(0, s.length()-1));
199 						}
200 						catch (NumberFormatException e) {
201 							throw new TokenizerException(startLine, startColumn, "Could not parse float tokenValue \"" + s + "\"", e);
202 						}
203 					}
204 					else if (floatingpoint) {
205 						try {
206 							tokenValue = Double.valueOf(s);
207 						}
208 						catch (NumberFormatException e) {
209 							throw new TokenizerException(startLine, startColumn, "Could not parse double tokenValue \"" + s + "\"", e);
210 						}
211 					}
212 					else if ((s.endsWith("l")) || (s.endsWith("L"))) {
213 						try {
214 							tokenValue = Long.decode(s.substring(0, s.length()-1));
215 						}
216 						catch (NumberFormatException e) {
217 							throw new TokenizerException(startLine, startColumn, "Could not parse long tokenValue \"" + s + "\"", e);
218 						}
219 					}
220 					else {
221 						try {
222 							tokenValue = Integer.decode(s);
223 						}
224 						catch (NumberFormatException e) {
225 							throw new TokenizerException(startLine, startColumn, "Could not parse integer tokenValue \"" + s + "\"", e);
226 						}
227 					}
228 
229 					tokenType = Type.NUMERIC;
230 					
231 					return tokenType;
232 				}
233 			}
234 		}
235 		else if (c == '\"') {
236 			int startLine = line;
237 			int startColumn = column;
238 			StringBuilder builder = new StringBuilder();
239 			
240 			while (true) {
241 				next();
242 
243 				if (c == EOC)
244 					throw new TokenizerException(startLine, startColumn, "Unclosed string");
245 				else if (c == '\"') {
246 					next();
247 					
248 					tokenType = Type.STRING;
249 					tokenValue = builder.toString();
250 					
251 					return tokenType;
252 				}
253 				else if (c == '\\') {
254 					next();
255 					
256 					if (c == EOC)
257 						throw new TokenizerException(startLine, startColumn, "Unclosed string");
258 
259 					switch (c) {
260 						case '\\':
261 							builder.append("\\");
262 							break;
263 							
264 						case '\"':
265 							builder.append("\"");
266 							break;
267 							
268 						case '\'':
269 							builder.append("\'");
270 							break;
271 							
272 						case '\r':
273 							builder.append("\r");
274 							break;
275 							
276 						case '\t':
277 							builder.append("\t");
278 							break;
279 							
280 						case '\n':
281 							builder.append("\n");
282 							break;
283 							
284 						default:
285 							throw new TokenizerException(line, column, "Invalid escape sequence \"\\" + c + "\"");
286 					}
287 				}
288 				else
289 					builder.append(c);
290 			}
291 		}
292 		else if (c == '.') {
293 			tokenType = Type.DELEGATOR;
294 			tokenValue = String.valueOf(c);
295 			
296 			next();
297 
298 			return tokenType;
299 		}
300 		else if (c == ',') {
301 			tokenType = Type.SEPARATOR;
302 			tokenValue = String.valueOf(c);
303 			
304 			next();
305 
306 			return tokenType;
307 		}
308 		else if (isUnaryOperator()) {
309 			tokenType = Type.UNARY_OPERATOR;
310 			tokenValue = String.valueOf(c);
311 			
312 			next();
313 			
314 			return tokenType;
315 		}
316 		else if (isMultiplicativeOperator()) {
317 			tokenType = Type.MULTIPLICATIVE_OPERATOR;
318 			tokenValue = String.valueOf(c);
319 			
320 			next();
321 			
322 			return tokenType;
323 		}
324 		else if (isAdditiveOperator()) {
325 			tokenType = Type.ADDITIVE_OPERATOR;
326 			tokenValue = String.valueOf(c);
327 			
328 			next();
329 			
330 			return tokenType;
331 		}
332 		else if (isShiftOperator()) {
333 			tokenType = Type.SHIFT_OPERATOR;
334 			
335 			if (c == '>') {
336 				next();
337 				
338 				if (lookAhead == '>') {
339 					next();
340 					tokenValue = ">>>";
341 				}
342 				else {
343 					tokenValue = ">>";
344 				}
345 			}
346 			else {
347 				next();
348 				
349 				tokenValue = "<<";
350 			}
351 			
352 			next();
353 			
354 			return tokenType;
355 		}
356 		else if (isRelationalOperator()) {
357 			tokenType = Type.RELATIONAL_OPERATOR;
358 			
359 			if (lookAhead == '=') {
360 				tokenValue = String.valueOf(c) + String.valueOf(lookAhead);
361 				
362 				next();
363 			}
364 			else {
365 				tokenValue = String.valueOf(c);
366 			}
367 			
368 			next();
369 			
370 			return tokenType;
371 		}
372 		else if (isEqualityOperator()) {
373 			tokenType = Type.EQUALITY_OPERATOR;
374 			tokenValue = String.valueOf(c) + String.valueOf(lookAhead);
375 			
376 			next();
377 			next();
378 			
379 			return tokenType;
380 		}
381 		else if (isBitwiseOperator()) {
382 			tokenType = Type.BITWISE_OPERATOR;
383 			tokenValue = String.valueOf(c);
384 			
385 			next();
386 			
387 			return tokenType;
388 		}
389 		else if (isConditionalAndOperator()) {
390 			tokenType = Type.CONDITIONAL_AND_OPERATOR;
391 			tokenValue = "&&";
392 			
393 			next();
394 			next();
395 			
396 			return tokenType;
397 		}
398 		else if (isConditionalOrOperator()) {
399 			tokenType = Type.CONDITIONAL_OR_OPERATOR;
400 			tokenValue = "||";
401 			
402 			next();
403 			next();
404 			
405 			return tokenType;
406 		}
407 		else if (c == '?') {
408 			tokenType = Type.INLINE_CONDIDION;
409 			tokenValue = "?";
410 			
411 			next();
412 			
413 			return tokenType;
414 		}
415 		else if (c == ':') {
416 			tokenType = Type.INLINE_SEPARATOR;
417 			tokenValue = ":";
418 			
419 			next();
420 			
421 			return tokenType;
422 		}
423 		else if (isAssignment()) {
424 			tokenType = Type.ASSIGNMENT;
425 			tokenValue = "=";
426 			
427 			next();
428 			
429 			return tokenType;
430 		}
431 		else if (c == '(') {
432 			tokenType = Type.PARENTHESIS_OPEN;
433 			tokenValue = String.valueOf(c);
434 			
435 			next();
436 			
437 			return tokenType;
438 		}
439 		else if (c == ')') {
440 			tokenType = Type.PARENTHESIS_CLOSE;
441 			tokenValue = String.valueOf(c);
442 			
443 			next();
444 			
445 			return tokenType;
446 		}
447 		else if (c == '[') {
448 			tokenType = Type.QUERY_OPEN;
449 			tokenValue = String.valueOf(c);
450 			
451 			next();
452 			
453 			return tokenType;
454 		}
455 		else if (c == ']') {
456 			tokenType = Type.QUERY_CLOSE;
457 			tokenValue = String.valueOf(c);
458 			
459 			next();
460 			
461 			return tokenType;
462 		}
463 		else if (isTransformerCall()) {
464 			tokenType = Type.TRANSFORMER_CALL;
465 			tokenValue = "->";
466 			
467 			next();
468 			next();
469 			
470 			return tokenType;
471 		}
472 		else {
473 			tokenType = Type.ERROR;
474 			tokenValue = String.valueOf(c);
475 			
476 			return tokenType;
477 		}
478 	}
479 
480 	/**
481 	 * Parses the next token as key identifier
482 	 * 
483 	 * KeyIdentifier = (LETTER | DIGIT | "_" | "-" | "." | "/") {LETTER | DIGIT | "_" | "-" | "." | "/"}
484 	 * 
485 	 * @return the next token
486 	 * @throws TokenizerException on occasion
487 	 */
488 	public Type readKeyToken() throws TokenizerException {
489 		nextWhileWhitespace();
490 		
491 		if (c == EOC) {
492 			tokenType = Type.NONE;
493 			tokenValue = "end of code";
494 			
495 			return tokenType;
496 		}
497 		else if (isKeyIdentifier()) {
498 			// read a key
499 			int start = offset;
500 			
501 			while (true) {
502 				next();
503 				
504 				if (!isKeyIdentifier()) {
505 					tokenValue = new String(code, start, offset-start);
506 					tokenType = Type.KEY;
507 					
508 					return tokenType;
509 				}
510 			}
511 		}  
512 		else
513 			return readToken();
514 	}
515 
516 	/**
517 	 * Goes to the next character. Sets c and lookAhead. Counts the offset, 
518 	 * line and column. Sets escaped to true if the current character 
519 	 * has been escaped. Transforms \r\n and \r to \n.
520 	 * c id EOD if it's the end of the data.
521 	 * @return true if there is another character, false otherwise
522 	 */
523 	private boolean next() {
524 		offset += 1;
525 		column += 1;
526 		
527 		c = lookAhead;
528 		
529 		lookAhead = (offset+1 < endOffset) ? code[offset+1] : EOC;
530 
531 		if (c == '\\')
532 			escapeCount += 1;
533 		else if (escapeCount > 0)
534 			escapeCount = 0;
535 
536 		if (c == EOC)
537 			return false;
538 		else if ((c == '\n') || ((c == '\r') && (lookAhead != '\n'))) {
539 			line += 1;
540 			column = 1;
541 		}
542 		
543 		return true;
544 	}
545 	
546 	private boolean nextWhileWhitespace() {
547 		while (isWhitespace())
548 			if (!next())
549 				return false;
550 		
551 		return true;
552 	}
553 	
554 	private boolean isWhitespace() {
555 		return (c == ' ') || (c == '\t') || (c == '\r') || (c == '\n');
556 	}
557 	
558 	private boolean isLetter(char c) {
559 		return
560 			((c >= 'a') && (c <= 'z')) ||
561 			((c >= 'A') && (c <= 'Z'))
562 		;
563 	}
564 	
565 	private boolean isDigit(char c) {
566 		return (c >= '0') && (c <= '9');
567 	}
568 	
569 	private boolean isUnaryOperator() {
570 		return ( 
571 			(c == '+') || ((c == '-') && (lookAhead != '>')) || 
572 			(c == '~') || ((c == '!') && (lookAhead != '='))
573 		) && (
574 			(tokenType == Type.NONE) || (tokenType == Type.UNARY_OPERATOR) ||
575 			(tokenType == Type.MULTIPLICATIVE_OPERATOR) || (tokenType == Type.ADDITIVE_OPERATOR) || 
576 			(tokenType == Type.SHIFT_OPERATOR) || (tokenType == Type.RELATIONAL_OPERATOR) || 
577 			(tokenType == Type.EQUALITY_OPERATOR) || (tokenType == Type.BITWISE_OPERATOR) || 
578 			(tokenType == Type.CONDITIONAL_AND_OPERATOR) || (tokenType == Type.CONDITIONAL_OR_OPERATOR) || 
579 			(tokenType == Type.PARENTHESIS_OPEN) || 
580 			(tokenType == Type.TRANSFORMER_CALL) ||
581 			(tokenType == Type.QUERY_OPEN)
582 		);
583 	}
584 	
585 	private boolean isMultiplicativeOperator() {
586 		return (c == '*') || (c == '/') || (c == '%');
587 	}
588 	
589 	private boolean isAdditiveOperator() {
590 		return (c == '+') || ((c == '-') && (lookAhead != '>'));
591 	}
592 	
593 	private boolean isShiftOperator() {
594 		return 
595 			((c == '<') && (lookAhead == '<')) || 
596 			((c == '>') && (lookAhead == '>'))
597 		;
598 	}
599 	
600 	private boolean isRelationalOperator() {
601 		return (c == '<') || (c == '>');
602 	}
603 	
604 	private boolean isEqualityOperator() {
605 		return ((c == '=') || (c == '!')) && (lookAhead == '='); 
606 	}
607 	
608 	private boolean isBitwiseOperator() {
609 		return 
610 			((c == '&') && (lookAhead != '&')) || 
611 			(c == '^') || 
612 			((c == '|') && (lookAhead != '|')) 
613 		;
614 	}
615 	
616 	private boolean isConditionalAndOperator() {
617 		return ((c == '&') && (lookAhead == '&')); 
618 	}
619 	
620 	private boolean isConditionalOrOperator() {
621 		return ((c == '|') && (lookAhead == '|')); 
622 	}
623 	
624 	private boolean isAssignment() {
625 		return (c == '=') && (lookAhead != '=');
626 	}
627 	
628 	private boolean isTransformerCall() {
629 		return ((c == '-') && (lookAhead == '>'));
630 	}
631 
632 	private boolean isKeyIdentifier() {
633 		return ((isLetter(c)) || (isDigit(c)) || (c == '_') || (c == '-') || (c == '.') || (c == '/'));
634 	}
635 	
636 	@Override
637 	public String toString() {
638 		return String.format(
639 			"%-24s [%-24s]", getTokenType(), getTokenValue()
640 		);
641 	}
642 }