-from scrapeops_scrapy.core.api import SOPSRequest
-from scrapeops_scrapy.normalizer.domains import DomainNormalizer
-from scrapeops_scrapy.utils import utils
 import json
 import logging
 import re
+import time
+
+from scrapeops_scrapy.core.api import SOPSRequest
+from scrapeops_scrapy.normalizer.domains import DomainNormalizer
+from scrapeops_scrapy.utils import utils
 
-class ErrorLogger(object):
 
+class ErrorLogger(object):
     ERROR_LOGGER_ACTIVE = True
 
     def __init__(self, spider, crawler, spider_settings, server_hostname, server_ip, start_time, log_file):
         self.spider = spider
         self.crawler = crawler
-        self.bot_name = crawler.settings.get('BOT_NAME', 'None')
+        self.bot_name = crawler.settings.get("BOT_NAME", "None")
         self.spider_settings = spider_settings
         self.server_hostname = server_hostname
         self.server_ip = server_ip
@@ -28,72 +30,70 @@ def update_error_logger(self, job_name, job_id):
 
     def log_error(self, reason=None, error=None, data=None, request_type=None):
         if ErrorLogger.ERROR_LOGGER_ACTIVE:
-            self._error_history.append({
-                'time': utils.current_time(),
-                'reason': reason,
-                'error': str(error),
-                'data': data,
-                'request_type': request_type,
-            })
-
+            self._error_history.append(
+                {
+                    "time": utils.current_time(),
+                    "reason": reason,
+                    "error": str(error),
+                    "data": data,
+                    "request_type": request_type,
+                }
+            )
 
     def send_error_report(self, error_type=None, body=None, log_data=False):
         if ErrorLogger.ERROR_LOGGER_ACTIVE:
             try:
                 data, status = SOPSRequest().error_report_request(error_type=error_type, body=body)
                 if status.valid:
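+                    # a valid response can include an sdk_error_id; when it does,
+                    # the log file is uploaded in a second request against that id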
-                    if log_data and self.log_file is not None and data.get('sdk_error_id') is not None:
-                        with open(self.log_file, 'rb') as f:
+                    if log_data and self.log_file is not None and data.get("sdk_error_id") is not None:
+                        with open(self.log_file, "rb") as f:
                             post_body = {
-                                'sops_sdk': 'scrapy',
-                                'spider_name': self.spider.name,
-                                'job_group_id': self.job_group_id,
-                                'job_group_name': self.job_group_name,
-                                'sdk_error_id': data.get('sdk_error_id')
-                            }
-                            _, status = SOPSRequest().error_report_request(error_type=error_type, body=post_body, files={'file': f})
+                                "sops_sdk": "scrapy",
+                                "spider_name": self.spider.name,
+                                "job_group_id": self.job_group_id,
+                                "job_group_name": self.job_group_name,
+                                "sdk_error_id": data.get("sdk_error_id"),
+                            }
+                            _, status = SOPSRequest().error_report_request(
+                                error_type=error_type, body=post_body, files={"file": f}
+                            )
                             if status.valid is False:
-                                self.log_error(reason='send_error_logs_failed', error=status.error)
+                                self.log_error(reason="send_error_logs_failed", error=status.error)
 
                 if status.valid is False:
-                    self.log_error(reason='send_error_report_failed', error=status.error)
+                    self.log_error(reason="send_error_report_failed", error=status.error)
             except Exception:
                 pass
 
-
     def sdk_error_close(self, reason=None, error=None, request_type=None, data=None):
         if ErrorLogger.ERROR_LOGGER_ACTIVE:
             self.log_error(reason=reason, error=error, data=data, request_type=request_type)
             error_data = {
-                'final_reason': reason,
-                'sops_sdk': 'scrapy',
-                'spider_name': self.spider.name,
-                'bot_name': self.bot_name,
-                'server_ip': self.server_ip,
-                'server_hostname': self.server_hostname,
-                'job_group_id': self.job_group_id,
-                'job_group_name': self.job_group_name,
-                'job_args': utils.get_args(),
-                'job_start_time': self.start_time,
-                'sops_scrapeops_version': utils.get_scrapeops_version(),
-                'sops_scrapy_version': utils.get_scrapy_version(),
-                'sops_python_version': utils.get_python_version(),
-                'sops_system_version': utils.get_system_version(),
-                'sops_middleware_enabled': utils.scrapeops_middleware_installed(self.spider_settings),
-                'error_history': self._error_history,
+                "final_reason": reason,
+                "sops_sdk": "scrapy",
+                "spider_name": self.spider.name,
+                "bot_name": self.bot_name,
+                "server_ip": self.server_ip,
+                "server_hostname": self.server_hostname,
+                "job_group_id": self.job_group_id,
+                "job_group_name": self.job_group_name,
+                "job_args": utils.get_args(),
+                "job_start_time": self.start_time,
+                "sops_scrapeops_version": utils.get_scrapeops_version(),
+                "sops_scrapy_version": utils.get_scrapy_version(),
+                "sops_python_version": utils.get_python_version(),
+                "sops_system_version": utils.get_system_version(),
+                "sops_middleware_enabled": utils.scrapeops_middleware_installed(self.spider_settings),
+                "error_history": self._error_history,
             }
-
-            self.send_error_report(error_type='sdk_close', body=error_data, log_data=True)
-
 
-
+            self.send_error_report(error_type="sdk_close", body=error_data, log_data=True)
 
-class TailLogHandler(logging.Handler):
 
+class TailLogHandler(logging.Handler):
     retryErrors = [
         "Couldn't bind",
-        "Hostname couldn't be looked up'"
-        "No route to host",
+        # the original entries were missing a comma, so Python silently
+        # concatenated them into one string; keep them as two separate
+        # patterns (and drop the stray trailing quote)
+        "Hostname couldn't be looked up",
+        "No route to host",
         "Connection was refused by other side",
         "TCP connection timed out",
         "File used for UNIX socket is no good",
@@ -124,123 +124,117 @@ def __init__(self, log_dict, log_dict_cumulative):
         self.log_dict = log_dict
         self.log_dict_cumulative = log_dict_cumulative
 
-
     def flush(self):
         self.log_dict.clear()
-
 
     def emit(self, record):
-
         try:
-
-            if (record.levelname == "ERROR" or record.levelname == "WARNING" or record.levelname == "CRITICAL"):
-
-                if hasattr(record, 'message'):
+            if record.levelname == "ERROR" or record.levelname == "WARNING" or record.levelname == "CRITICAL":
+                if hasattr(record, "message"):
                     errorMessage = record.message
-                    fileAndLine = record.pathname + ', line: ' + str(record.lineno)
-                    dateTime = record.asctime
+                    fileAndLine = record.pathname + ", line: " + str(record.lineno)
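+                    # record.asctime is only set once a logging.Formatter has
+                    # processed the record, so use the format_time() helper instead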
+                    dateTime = self.format_time(record)
                     type = record.levelname
                     engine = record.name
 
-
-                    #covering warnings/probableCause/traceback missing
-                    traceback = 'No traceback available'
-                    probableCause = ''
+                    # covering warnings/probableCause/traceback missing
+                    traceback = "No traceback available"
+                    probableCause = ""
 
                     if record.exc_text is not None:
                         traceback = record.exc_text
-                        splitTraceback = traceback.split('\n')
+                        splitTraceback = traceback.split("\n")
                         probableCause = splitTraceback[len(splitTraceback) - 1]
 
-
-                    #covering retrys
-                    if ("Gave up retrying <" in record.message):
-
+                    # covering retrys
+                    if "Gave up retrying <" in record.message:
                         for retryError in self.retryErrors:
-                            if (retryError in record.message):
-                                method = record.message.split('<')[1].split(' ')[0]
+                            if retryError in record.message:
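+                                # retry messages look like "Gave up retrying <GET https://...> ...";
+                                # grab the HTTP method between "<" and the first space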
+                                method = record.message.split("<")[1].split(" ")[0]
                                 errorMessage = "Error: Gave up retrying " + method + " request - " + retryError
-                                fileAndLine = ''
+                                fileAndLine = ""
                                 probableCause = retryError
                                 break
-
+
                     # Deprecation Warnings
                     if "ScrapyDeprecationWarning:" in record.message and record.message[0] == "/":
                         splitString = record.message.split("ScrapyDeprecationWarning:")
                         errorMessage = "ScrapyDeprecationWarning: " + splitString[1]
                         probableCause = splitString[0]
 
-
                     # "Some Other Error Occurred"
-                    if "Some other error occurred: " in record.message:
-                        splitError = record.message.split('/')
+                    if "Some other error occurred: " in record.message:
+                        splitError = record.message.split("/")
                         cleanError = splitError[0].split(">: ")[1]
                         errorMessage = "Some other error occurred: " + cleanError
                         probableCause = cleanError
                         traceback = record.message
 
-
                     # Convert Urls To Domains in Error Messages
-                    urls = re.findall(r'(https?://[^\s]+)', errorMessage)
+                    urls = re.findall(r"(https?://[^\s]+)", errorMessage)
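+                    # normalizing to the domain lets the same error on many URLs
+                    # aggregate under a single log_dict key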
                     for url in urls:
                         domain = DomainNormalizer.get_domain(url)
                         errorMessage = errorMessage.replace(url, domain)
 
-
                     if errorMessage in self.log_dict:
-                        self.log_dict[errorMessage]['count'] = self.log_dict[errorMessage]['count'] + 1
+                        self.log_dict[errorMessage]["count"] = self.log_dict[errorMessage]["count"] + 1
                     else:
                         self.log_dict[errorMessage] = {
-                            'type': type,
-                            'engine': engine,
-                            'name': errorMessage,
-                            'count': 1,
-                            'traceback': traceback,
-                            'message': probableCause,
-                            'filepath': fileAndLine,
-                            'dateTime': dateTime
-                        }
-
-                    if (SOPSRequest.HIGH_FREQ_ACC == True):
-
-                        if (errorMessage in self.log_dict_cumulative):
-                            self.log_dict_cumulative[errorMessage]['count'] = self.log_dict_cumulative[errorMessage]['count'] + 1
+                            "type": type,
+                            "engine": engine,
+                            "name": errorMessage,
+                            "count": 1,
+                            "traceback": traceback,
+                            "message": probableCause,
+                            "filepath": fileAndLine,
+                            "dateTime": dateTime,
+                        }
+
+                    if SOPSRequest.HIGH_FREQ_ACC == True:
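+                        # cumulative totals are only tracked for high-frequency accounts
+                        # and, unlike log_dict, are never cleared by flush()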
+                        if errorMessage in self.log_dict_cumulative:
+                            self.log_dict_cumulative[errorMessage]["count"] = (
+                                self.log_dict_cumulative[errorMessage]["count"] + 1
+                            )
                         else:
-
-                            self.log_dict_cumulative[errorMessage] = {
-                                'type': type,
-                                'engine': engine,
-                                'name': errorMessage,
-                                'count': 1,
-                                'traceback': traceback,
-                                'message': probableCause,
-                                'filepath': fileAndLine,
-                                'dateTime': dateTime
+                            self.log_dict_cumulative[errorMessage] = {
+                                "type": type,
+                                "engine": engine,
+                                "name": errorMessage,
+                                "count": 1,
+                                "traceback": traceback,
+                                "message": probableCause,
+                                "filepath": fileAndLine,
+                                "dateTime": dateTime,
                             }
-
+
         except Exception as e:
-            logging.info('Error: Error in error logger')
+            logging.info("Error: Error in error logger")
             logging.info(e, exc_info=True)
 
-class TailLogger(object):
+    def format_time(self, record):
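+        # self.formatter is whatever Formatter was attached via setFormatter();
+        # logging.Handler leaves it as None by default, hence the fallback below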
+        if self.formatter:
+            return self.formatter.formatTime(record)
+        else:
+            # Fallback to a basic time format if no formatter is set
+            return time.strftime("%Y-%m-%d %H:%M:%S")
+
 
+class TailLogger(object):
     def __init__(self):
         self._log_dict = {}
         self._log_dict_cumulative = {}
         self._log_handler = TailLogHandler(self._log_dict, self._log_dict_cumulative)
 
-    def contents(self, type="diff"):
-
-        if (type == "cumulative"):
-            jsonLogsCumulative = json.dumps(self._log_dict_cumulative, indent=2)
+    def contents(self, type="diff"):
+        if type == "cumulative":
+            jsonLogsCumulative = json.dumps(self._log_dict_cumulative, indent=2)
             return jsonLogsCumulative
 
         else:
-            jsonLogs = json.dumps(self._log_dict, indent=2)
+            jsonLogs = json.dumps(self._log_dict, indent=2)
             self._log_handler.flush()
             return jsonLogs
 
     @property
     def log_handler(self):
         return self._log_handler
-
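
For context, a minimal sketch of how these pieces fit together. TailLogHandler is a standard logging.Handler, so attaching TailLogger's handler to the root logger lets it aggregate every WARNING/ERROR/CRITICAL record Scrapy emits, and contents() drains the aggregates as JSON. The import path, logger name, and messages below are illustrative assumptions, not code from this commit:

import logging

from scrapeops_scrapy.core.error_logger import TailLogger  # module path assumed

# basicConfig's StreamHandler formats each record first, which sets
# record.message - the hasattr() check in emit() depends on that attribute.
logging.basicConfig(level=logging.INFO)

tail_logger = TailLogger()
handler = tail_logger.log_handler
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logging.getLogger().addHandler(handler)  # root logger: sees Scrapy's loggers too

# A retry-style message: emit() rewrites it as
# "Error: Gave up retrying GET request - TCP connection timed out"
logging.getLogger("scrapy.core.scraper").error(
    "Gave up retrying <GET https://example.com/page> (failed 3 times): TCP connection timed out"
)

print(tail_logger.contents())              # errors since the last flush, as JSON
print(tail_logger.contents("cumulative"))  # populated only when SOPSRequest.HIGH_FREQ_ACC is True

Note that contents() with the default "diff" type flushes after reading, so repeated calls report only new errors, while the cumulative view persists for the handler's lifetime.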