-
Notifications
You must be signed in to change notification settings - Fork 3
767_count_domens #788
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
767_count_domens #788
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,11 +9,13 @@ class ReferencesToLiteratureCheck(BaseReportCriterion): | |
| _description = '' | ||
| id = 'literature_references' | ||
|
|
||
| def __init__(self, file_info, min_ref=1, max_ref=1000, headers_map=None): | ||
| def __init__(self, file_info, min_ref=1, max_ref=1000, max_count_domains = 5,headers_map=None): | ||
| super().__init__(file_info) | ||
| self.headers = [] | ||
| self.literature_header = None | ||
| self.literature_reference_text = [] | ||
| self.literature_domains = [] | ||
| self.max_count_domains = max_count_domains | ||
| self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)' | ||
| if headers_map: | ||
| self.config = headers_map | ||
|
|
@@ -59,7 +61,8 @@ def check(self): | |
| return answer(False, | ||
| f'В Списке использованных источников не найдено ни одного источника.<br><br>Проверьте корректность использования нумированного списка.') | ||
|
|
||
| duplicates = self.checking_duplicate_sources() | ||
| duplicates_ref = self.checking_duplicate_sources(self.literature_reference_text, 2) | ||
| duplicates_domains = self.checking_duplicate_sources(self.literature_domains, self.max_count_domains) | ||
| references, ref_sequence = self.search_references(start_literature_par) | ||
| all_numbers = set(range(1, number_of_sources + 1)) | ||
| if len(references.symmetric_difference(all_numbers)) == 0: | ||
|
|
@@ -68,7 +71,7 @@ def check(self): | |
| elif ref_sequence: | ||
| result_str += f"Источники должны нумероваться в порядке упоминания в тексте. Неправильные последовательности: {'; '.join(num for num in ref_sequence)}" | ||
| return answer(False, result_str) | ||
| elif not duplicates: | ||
| elif not duplicates_ref and not duplicates_domains: | ||
| return answer(True, f"Пройдена!") | ||
| elif len(references.difference(all_numbers)): | ||
| if len(all_numbers.difference(references)) == 0: | ||
|
|
@@ -82,14 +85,24 @@ def check(self): | |
| all_numbers -= references | ||
| result_str = f'Упомянуты не все источники из списка.<br>Список источников без упоминания: {", ".join(str(num) for num in sorted(all_numbers))} <br> Всего источников: {number_of_sources}<br><br>' | ||
|
|
||
| if duplicates: | ||
| if duplicates_ref: | ||
| message = '' | ||
| for duplicate in duplicates: | ||
| for duplicate in duplicates_ref: | ||
| message += f'<li>Источники с номерами: {duplicate[1]} ссылаются на один и тот же источник: {duplicate[0]};</li>\n' | ||
| result_str += (f'Повторяющиеся источники:' | ||
| f'<ul>\n' | ||
| f'{message}' | ||
| f'</ul>') | ||
|
|
||
| if duplicates_domains: | ||
| message = '' | ||
| for duplicate in duplicates_domains: | ||
| message += f'<li>Источники с номерами: {duplicate[1]} ссылаются на один и тот же домен: {duplicate[0]};</li>\n' | ||
| result_str += (f'Повторяющиеся домены, максимум на один домен могут ссылаться не более {self.max_count_domains} источников:' | ||
| f'<ul>\n' | ||
| f'{message}' | ||
| f'</ul>') | ||
|
|
||
| result_str += ''' | ||
| Если возникли проблемы, попробуйте сделать следующее: | ||
| <ul> | ||
|
|
@@ -139,14 +152,14 @@ def add_references(self, k, prev_ref, array_of_references, ref_sequence): | |
| array_of_references.add(k) | ||
| return prev_ref | ||
|
|
||
| def checking_duplicate_sources(self) -> list: | ||
| """Функция нахождения дубликатов в источниках""" | ||
| counter = Counter([text.lower() for text in self.literature_reference_text]) | ||
| def checking_duplicate_sources(self, sources: list[str], max_count: int) -> list: | ||
| """Функция нахождения дубликатов в определенных позициях""" | ||
| counter = Counter([text.lower() for text in sources]) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Замените список на генератор — он будет работать быстрее и займёт меньше памяти. |
||
|
|
||
| duplicates = [] | ||
| for text, count in counter.items(): | ||
| if count >= 2: | ||
| positions_duplicates = [i + 1 for i, text_in_ref in enumerate(self.literature_reference_text) if text == text_in_ref.lower()] | ||
| if count >= max_count and text != '': | ||
|
Comment on lines
160
to
+161
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Чтобы не делать на каждом шаге итерации сравнение |
||
| positions_duplicates = [i + 1 for i, text_in_ref in enumerate(sources) if text == text_in_ref.lower()] | ||
|
|
||
| if positions_duplicates: | ||
| duplicates.append(( | ||
|
|
@@ -166,6 +179,15 @@ def find_start_paragraph(self): | |
| break | ||
| return start_index | ||
|
|
||
| def find_domains(self, sources: str): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Смысла в этой функции как в методе класса — 0 (он ещё и меняет состояние объекта, хотя вроде как должен просто найти домены) — проще regexp использовать в |
||
| pattern = r'(?:https?|ftp)?://([^/\s?#]+)' | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Вынесите в поле класса |
||
| match = re.search(pattern, sources, re.IGNORECASE) | ||
| if match and match.group(1): | ||
| self.literature_domains.append(match.group(1)) | ||
| else: | ||
| self.literature_domains.append('') #чтобы можно было определить номер | ||
|
Comment on lines
+185
to
+188
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Сократите до 1 строки (тернарный оператор)
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Чтобы определить номер, достаточно хранить его вместе с доменом, иначе у вас получится список из 100 пустых строк (= много источников), потому что доменов среди них нет |
||
|
|
||
|
|
||
| def count_sources_vkr(self, header): | ||
| literature_counter = 0 | ||
| if not len(header["child"]): | ||
|
|
@@ -176,6 +198,7 @@ def count_sources_vkr(self, header): | |
| # if re.search(f"дата обращения", child["text"].lower()): | ||
| literature_counter += 1 | ||
| self.literature_reference_text.append(child["text"]) | ||
| self.find_domains(child["text"]) | ||
| return literature_counter | ||
|
|
||
| def count_sources(self): | ||
|
|
@@ -200,6 +223,7 @@ def count_sources(self): | |
| if re.match(f"{literature_counter + 1}.", one_page[ind]): | ||
| literature_counter += 1 | ||
| self.literature_reference_text.append(one_page[ind]) | ||
| self.find_domains(one_page[ind]) | ||
| return literature_counter | ||
|
|
||
| def search_literature_start_pdf(self): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Добавьте для max_count значение по умолчанию (= исходная логика с дубликатами источников).