Batuhan Osman TASKAYA / cpython

Commit fc6f5339, authored Mar 07, 1997 by Guido van Rossum
Parent: 19700b6a

    Ka-Ping's version.

Showing 1 changed file with 132 additions and 45 deletions

Lib/tokenize.py  (+132, -45)
-# This module compiles a regular expression that recognizes Python tokens.
-# It is designed to match the working of the Python tokenizer exactly.
-# It takes care of everything except indentation;
-# note that un-escaped newlines are tokens, too.
-# tokenprog.regs[3] gives the location of the token without whitespace
-# It also defines various subexpressions, but doesn't compile them.
-# See the function test() below for an example of how to use.
-
-import regex
-
-# Note: to get a quoted backslash in a regexp, it must be quadrupled.
+"""tokenize.py (Ka-Ping Yee, 4 March 1997)
+
+This module compiles a regular expression that recognizes Python tokens
+in individual lines of text. The regular expression handles everything
+except indentation, continuations, and triple-quoted strings. The function
+'tokenize.tokenize()' takes care of these things for streams of text. It
+accepts a file-like object and a function, uses the readline() method to
+scan the file, and calls the function once for each token found,
+passing its type, a string containing the token, the line number, the line,
+and the starting and ending positions of the token within the line.
+It is designed to match the working of the Python tokenizer exactly."""
+
+import string, regex
+from token import *
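The readline-plus-callback interface described in the new docstring survives, in spirit, in today's standard library. As a point of comparison only (this is not the 1997 regex-based API in the diff), a minimal Python 3 sketch using tokenize.generate_tokens, which reports the same kind of per-token information: type, string, start/end position, and source line.

import io
import tokenize

def show(source):
    # Each TokenInfo carries roughly what the 1997 tokeneater callback
    # received: a type, the token string, start/end positions and the line.
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string), tok.start, tok.end)

show("x = 1 + 2\n")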
-Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'
+def group(*choices): return '\(' + string.join(choices, '\|') + '\)'
+
+Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
 Name = '[a-zA-Z_][a-zA-Z0-9_]*'

 Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
 Octnumber = '0[0-7]*[lL]?'
 Decnumber = '[1-9][0-9]*[lL]?'
-Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber
+Intnumber = group(Hexnumber, Octnumber, Decnumber)
 Exponent = '[eE][-+]?[0-9]+'
-Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
+Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
 Expfloat = '[0-9]+' + Exponent
-Floatnumber = Pointfloat + '\|' + Expfloat
-Number = Floatnumber + '\|' + Intnumber
+Floatnumber = group(Pointfloat, Expfloat)
+Number = group(Floatnumber, Intnumber)

-String = '\'\(\\\\.\|[^\\\n\']\)*\'' + '\|' + '"\(\\\\.\|[^\\\n"]\)*"'
-# Note: this module *recognizes* double quotes, but for backward
-# compatibility, it doesn't *use* them!
+Single = group('^\'', '[^\]\'')
+Double = group('^"', '[^\]"')
+Tsingle = group('^\'\'\'', '[^\]\'\'\'')
+Tdouble = group('^"""', '[^\]"""')
+Triple = group('\'\'\'', '"""')
+String = group('\'' + group('[\].', '[^\'\]') + '*' + group('\'', '[\]\n'),
+               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

-Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
+Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
+                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
 Bracket = '[][(){}]'
-Special = '[:;.,`\n]'
-Funny = Operator + '\|' + Bracket + '\|' + Special
+Special = group('[\]?\r?\n', '[:;.,`\f]')
+Funny = group(Operator, Bracket, Special)

-PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny
-Token = Ignore + '\(' + PlainToken + '\)'
+PlainToken = group(Name, Number, Triple, String, Funny)
+Token = Ignore + PlainToken
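The group() helper in the new code is plain string composition for the old emacs-style regex module, where groups are spelled \( ... \) and alternation is \|. A rough sketch of the same composition against today's re module, where that escaping disappears; this is an illustration only, not part of the original file.

import re

def group(*choices):
    # Same idea as the helper above, but in modern re syntax:
    # plain parentheses and '|' instead of \( \) and \|.
    return '(' + '|'.join(choices) + ')'

Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)

print(re.match(Intnumber, '0x1fL').group(0))    # -> 0x1fL
print(re.match(Intnumber, '42').group(0))       # -> 42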
-try:
-    save_syntax = regex.set_syntax(0)          # Use default syntax
-    tokenprog = regex.compile(Token)
+save_syntax = regex.set_syntax(0)              # use default syntax
+tokenprog = regex.compile(Token)
+endprogs = {'\'': regex.compile(Single), '"': regex.compile(Double),
+            '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble)}

-finally:
-    if save_syntax != 0:
-        dummy = regex.set_syntax(save_syntax)  # Restore original syntax
-
-def test(file):
-    f = open(file, 'r')
-    while 1:
-        line = f.readline()
-        if not line: break
-        i, n = 0, len(line)
-        while i < n:
-            j = tokenprog.match(line, i)
-            if j < 0:
-                print 'No token at', `line[i:i+20]` + '...'
-                i = i+1
-            else:
-                i = i+j
-                a, b = tokenprog.regs[3]
-                if a < b:
-                    print 'Token:', `line[a:b]`
+regex.set_syntax(save_syntax)                  # restore original syntax
+
+tabsize = 8
+TokenError = 'TokenError'
+
+def printtoken(type, string, linenum, line, start, end):   # for testing
+    print `linenum` + ':', tok_name[type], repr(string)
+def tokenize(readline, tokeneater=printtoken):
+    linenum = parenlev = continued = 0
+    namechars, numchars = string.letters + '_', string.digits
+    contstr = ''
+    indents = [0]
+
+    while 1:                                   # loop over lines in stream
+        line = readline()
+        linenum = linenum + 1
+        if line[-2:] == '\r\n': line = line[:-2] + '\n'
+        pos, max = 0, len(line)
+
+        if contstr:                            # continued string
+            if not line: raise TokenError, "EOF within multi-line string"
+            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
+            if endprog.search(line) >= 0:
+                pos = end = endprog.regs[0][1]
+                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
+                contstr = ''
+            else:
+                contstr = contstr + line
+                continue
+
+        elif parenlev == 0 and not continued:  # this is a new statement
+            if not line: break
+            column = 0
+            while 1:                           # measure leading whitespace
+                if line[pos] == ' ': column = column + 1
+                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
+                elif line[pos] == '\f': column = 0
+                else: break
+                pos = pos + 1
+            if line[pos] in '#\n': continue    # skip comments or blank lines
+
+            if column > indents[-1]:           # count indents or dedents
+                indents.append(column)
+                tokeneater(INDENT, '\t', linenum, line, 0, 0)
+            while column < indents[-1]:
+                indents = indents[:-1]
+                tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+
+        else:                                  # continued statement
+            if not line: raise TokenError, "EOF within multi-line statement"
+            continued = 0
+
+        while pos < max:
+            if tokenprog.match(line, pos) > 0:             # scan for tokens
+                start, end = tokenprog.regs[3]
+                token = line[start:end]
+                pos = end
+
+                if token[0] in namechars:                  # ordinary name
+                    tokeneater(NAME, token, linenum, line, start, end)
+                elif token[0] in numchars:                 # ordinary number
+                    tokeneater(NUMBER, token, linenum, line, start, end)
+                elif token in ('\'\'\'', '"""'):           # triple-quoted
+                    endprog = endprogs[token]
+                    if endprog.search(line, pos) >= 0:     # all on one line
+                        pos = endprog.regs[0][1]
+                        tokeneater(STRING, token, linenum, line, start, pos)
+                    else:
+                        contstr = line[start:]             # multiple lines
+                        break
+                elif token[0] in '\'"':
+                    if token[-1] == '\n':                  # continued string
+                        endprog, contstr = endprogs[token[0]], line[start:]
+                        break
+                    else:                                  # ordinary string
+                        tokeneater(STRING, token, linenum, line, start, end)
+                elif token[0] == '\n':
+                    tokeneater(NEWLINE, token, linenum, line, start, end)
+                elif token[0] == '\\':                     # continued stmt
+                    continued = 1
+                else:
+                    if token[0] in '([{': parenlev = parenlev + 1
+                    if token[0] in ')]}': parenlev = parenlev - 1
+                    tokeneater(OP, token, linenum, line, start, end)
+            else:
+                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos+1)
+                pos = pos + 1
+
+    for indent in indents[1:]:                 # pop remaining indent levels
+        tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+
+if __name__ == '__main__':                     # testing
+    import sys
+    file = open(sys.argv[-1])
+    tokenize(file.readline)
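The leading-whitespace loop in tokenize() drives the INDENT/DEDENT bookkeeping: each space adds one column, a tab jumps to the next multiple of tabsize, and a form feed resets the column to zero. Below is a small self-contained check of that arithmetic, rewritten for a current Python (integer division spelled //); it is purely illustrative and not part of the commit.

def measure(prefix, tabsize=8):
    # Mirrors the column computation in tokenize() above.
    column = 0
    for ch in prefix:
        if ch == ' ':
            column = column + 1
        elif ch == '\t':
            column = (column // tabsize + 1) * tabsize
        elif ch == '\f':
            column = 0
        else:
            break
    return column

print(measure('    '))    # 4
print(measure(' \t'))     # 8, a tab advances to the next tab stop
print(measure('\t  '))    # 10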