From 8417e03e7151b381e379630e0b4ca3f390f89719 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 17 Jan 2023 10:50:31 -0300 Subject: [PATCH 01/89] Refactoring scheduler html --- main/templates/main/scheduler.html | 424 ------------------ .../main/scheduler/_calendar_daily.html | 2 + .../main/scheduler/_calendar_monthly.html | 2 + .../main/scheduler/_calendar_weekly.html | 2 + .../main/scheduler/_calendar_yearly.html | 75 ++++ main/templates/main/scheduler/_header.html | 19 + .../_modal_personalized_repetition.html | 182 ++++++++ .../scheduler/_modal_scheduling_detail.html | 13 + .../main/scheduler/_modal_set_scheduling.html | 61 +++ .../main/scheduler/_notification.html | 9 + main/templates/main/scheduler/index.html | 44 ++ main/views.py | 2 +- 12 files changed, 410 insertions(+), 425 deletions(-) delete mode 100644 main/templates/main/scheduler.html create mode 100644 main/templates/main/scheduler/_calendar_daily.html create mode 100644 main/templates/main/scheduler/_calendar_monthly.html create mode 100644 main/templates/main/scheduler/_calendar_weekly.html create mode 100644 main/templates/main/scheduler/_calendar_yearly.html create mode 100644 main/templates/main/scheduler/_header.html create mode 100644 main/templates/main/scheduler/_modal_personalized_repetition.html create mode 100644 main/templates/main/scheduler/_modal_scheduling_detail.html create mode 100644 main/templates/main/scheduler/_modal_set_scheduling.html create mode 100644 main/templates/main/scheduler/_notification.html create mode 100644 main/templates/main/scheduler/index.html diff --git a/main/templates/main/scheduler.html b/main/templates/main/scheduler.html deleted file mode 100644 index a83d3a18..00000000 --- a/main/templates/main/scheduler.html +++ /dev/null @@ -1,424 +0,0 @@ -{% extends 'main/base.html' %} - -{% load static %} - -{% block title %} -Agendamento de coletas -{% endblock %} - -{% block content %} - -
-
-
-
- - - -
-

1 de agosto de 2022

-
- - -
-
- -
-
-
- Notificação - -
-
- -
-
- -
-
-

Janeiro

- -
- -
-
-
-

Fevereiro

-
- -
-
-
-

Março

-
- -
-
-
-

Abril

-
- -
-
-
-

Maio

-
- -
-
-
-

Junho

-
- -
-
-
-

Julho

-
- -
-
-
-

Agosto

-
- -
-
-
-

Setembro

-
- -
-
-
-

Outubro

-
- -
-
-
-

Novembro

-
- -
-
-
-

Dezembro

-
- -
-
-
- -
-
- -
-
- -
-

-
-
- Qua. -
-
- 3 -
-
-

-
    -
  • -
    - - 3 AM - -
    -
    -
    -
    -
    -
  • -
-
- - -
- - - - - - - - - -{% endblock %} - -{% block js %} - - - - -{% endblock %} \ No newline at end of file diff --git a/main/templates/main/scheduler/_calendar_daily.html b/main/templates/main/scheduler/_calendar_daily.html new file mode 100644 index 00000000..2e818986 --- /dev/null +++ b/main/templates/main/scheduler/_calendar_daily.html @@ -0,0 +1,2 @@ +
+
\ No newline at end of file diff --git a/main/templates/main/scheduler/_calendar_monthly.html b/main/templates/main/scheduler/_calendar_monthly.html new file mode 100644 index 00000000..b209065d --- /dev/null +++ b/main/templates/main/scheduler/_calendar_monthly.html @@ -0,0 +1,2 @@ +
+
\ No newline at end of file diff --git a/main/templates/main/scheduler/_calendar_weekly.html b/main/templates/main/scheduler/_calendar_weekly.html new file mode 100644 index 00000000..a3371d84 --- /dev/null +++ b/main/templates/main/scheduler/_calendar_weekly.html @@ -0,0 +1,2 @@ +
+
\ No newline at end of file diff --git a/main/templates/main/scheduler/_calendar_yearly.html b/main/templates/main/scheduler/_calendar_yearly.html new file mode 100644 index 00000000..e28d727b --- /dev/null +++ b/main/templates/main/scheduler/_calendar_yearly.html @@ -0,0 +1,75 @@ +
+
+

Janeiro

+ +
+ +
+
+
+

Fevereiro

+
+ +
+
+
+

Março

+
+ +
+
+
+

Abril

+
+ +
+
+
+

Maio

+
+ +
+
+
+

Junho

+
+ +
+
+
+

Julho

+
+ +
+
+
+

Agosto

+
+ +
+
+
+

Setembro

+
+ +
+
+
+

Outubro

+
+ +
+
+
+

Novembro

+
+ +
+
+
+

Dezembro

+
+ +
+
+
\ No newline at end of file diff --git a/main/templates/main/scheduler/_header.html b/main/templates/main/scheduler/_header.html new file mode 100644 index 00000000..f7732c96 --- /dev/null +++ b/main/templates/main/scheduler/_header.html @@ -0,0 +1,19 @@ +
+
+
+
+ + + +
+

1 de agosto de 2022

+
+ + +
+
\ No newline at end of file diff --git a/main/templates/main/scheduler/_modal_personalized_repetition.html b/main/templates/main/scheduler/_modal_personalized_repetition.html new file mode 100644 index 00000000..70d9b191 --- /dev/null +++ b/main/templates/main/scheduler/_modal_personalized_repetition.html @@ -0,0 +1,182 @@ + \ No newline at end of file diff --git a/main/templates/main/scheduler/_modal_scheduling_detail.html b/main/templates/main/scheduler/_modal_scheduling_detail.html new file mode 100644 index 00000000..c62cf9fb --- /dev/null +++ b/main/templates/main/scheduler/_modal_scheduling_detail.html @@ -0,0 +1,13 @@ + \ No newline at end of file diff --git a/main/templates/main/scheduler/_modal_set_scheduling.html b/main/templates/main/scheduler/_modal_set_scheduling.html new file mode 100644 index 00000000..60852466 --- /dev/null +++ b/main/templates/main/scheduler/_modal_set_scheduling.html @@ -0,0 +1,61 @@ + \ No newline at end of file diff --git a/main/templates/main/scheduler/_notification.html b/main/templates/main/scheduler/_notification.html new file mode 100644 index 00000000..ec2eb8ae --- /dev/null +++ b/main/templates/main/scheduler/_notification.html @@ -0,0 +1,9 @@ +
+
+ Notificação + +
+
+ +
+
\ No newline at end of file diff --git a/main/templates/main/scheduler/index.html b/main/templates/main/scheduler/index.html new file mode 100644 index 00000000..80c49e33 --- /dev/null +++ b/main/templates/main/scheduler/index.html @@ -0,0 +1,44 @@ +{% extends 'main/base.html' %} + +{% load static %} + +{% block title %} +Agendamento de coletas +{% endblock %} + +{% block content %} + +{% include "main/scheduler/_header.html" %} + +
+ {% include "main/scheduler/_notification.html" %} + + + {% include "main/scheduler/_calendar_yearly.html" %} + + {% include "main/scheduler/_calendar_monthly.html" %} + + {% include "main/scheduler/_calendar_weekly.html" %}> + + {% include "main/scheduler/_calendar_daily.html" %}> + + +
+ +{% include "main/scheduler/_modal_set_scheduling.html" %}> + +{% include "main/scheduler/_modal_scheduling_detail.html" %}> + +{% include "main/scheduler/_modal_personalized_repetition.html" %}> + +{% endblock %} + +{% block js %} + + + + +{% endblock %} \ No newline at end of file diff --git a/main/views.py b/main/views.py index 18450e8a..0d34b2e0 100644 --- a/main/views.py +++ b/main/views.py @@ -922,7 +922,7 @@ def scheduler(request): context = { 'crawl_requests': crawl_requests } - return render(request, 'main/scheduler.html', context) + return render(request, 'main/scheduler/index.html', context) # API ######## From ebe867d470522a29c12c22d143a963364587f244 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Mon, 6 Feb 2023 15:28:22 -0300 Subject: [PATCH 02/89] Enable interface resources --- main/staticfiles/js/scheduler/scheduler.js | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/main/staticfiles/js/scheduler/scheduler.js b/main/staticfiles/js/scheduler/scheduler.js index 0a56b0da..a333b354 100644 --- a/main/staticfiles/js/scheduler/scheduler.js +++ b/main/staticfiles/js/scheduler/scheduler.js @@ -460,14 +460,12 @@ function show_task_detail(task_id) { - From bbcfbc547e74d39d708e9e2f4187778c9d21857d Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Mon, 27 Feb 2023 21:59:27 -0300 Subject: [PATCH 21/89] Interface updated for new scheduling config --- main/staticfiles/js/scheduler/scheduler.js | 18 +++++ .../main/scheduler/_modal_set_scheduling.html | 78 +++++++++++++------ main/views.py | 4 +- 3 files changed, 77 insertions(+), 23 deletions(-) diff --git a/main/staticfiles/js/scheduler/scheduler.js b/main/staticfiles/js/scheduler/scheduler.js index c9132edc..a620ee61 100644 --- a/main/staticfiles/js/scheduler/scheduler.js +++ b/main/staticfiles/js/scheduler/scheduler.js @@ -360,6 +360,14 @@ function valid_new_scheduling() { return; } + let now = new Date(); + let start_date = new Date(scheduling_task.start_date); + + if (start_date < now) { + alert('O horário de início da coleta deve ser maior que o horário atual!'); + return; + } + if (task_being_edited) { services.save_updated_scheduling(task_being_edited); @@ -813,5 +821,15 @@ $(document).ready(function () { scheduling_task.crawl_request = parseInt($(this).val()); }); + $('#scheduling-timezone').on('change', function() { + if (task_being_edited) + scheduling_task = task_being_edited; + + else + scheduling_task = new_scheduling_config; + + scheduling_task.timezone = $(this).val(); + }); + update_calendar_mode('daily'); }); diff --git a/main/templates/main/scheduler/_modal_set_scheduling.html b/main/templates/main/scheduler/_modal_set_scheduling.html index 60852466..b5bc4e33 100644 --- a/main/templates/main/scheduler/_modal_set_scheduling.html +++ b/main/templates/main/scheduler/_modal_set_scheduling.html @@ -7,7 +7,10 @@ \ No newline at end of file diff --git a/main/templates/main/scheduler/_modal_all_schedulings.html b/main/templates/main/scheduler/_modal_all_schedulings.html new file mode 100644 index 00000000..adb00e26 --- /dev/null +++ b/main/templates/main/scheduler/_modal_all_schedulings.html @@ -0,0 +1,82 @@ + \ No newline at end of file diff --git a/main/templates/main/scheduler/_modal_set_scheduling.html b/main/templates/main/scheduler/_modal_set_scheduling.html index b5bc4e33..f93be643 100644 --- a/main/templates/main/scheduler/_modal_set_scheduling.html +++ b/main/templates/main/scheduler/_modal_set_scheduling.html @@ -50,8 +50,7 @@ +
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ColetorDataHoraStatusAções
Coletor 101/01/202100:00Em execução - - -
Coletor 201/01/202100:00Em execução - - -
Coletor 301/01/202100:00Em execução - -
+
+
    +
+
diff --git a/main/templates/main/scheduler/index.html b/main/templates/main/scheduler/index.html index 96bc988b..43b778d0 100644 --- a/main/templates/main/scheduler/index.html +++ b/main/templates/main/scheduler/index.html @@ -10,6 +10,16 @@ {% include "main/scheduler/_header.html" %} + +
{% include "main/scheduler/_notification.html" %} @@ -39,6 +49,10 @@ {% endblock %} {% block js %} + + diff --git a/main/views.py b/main/views.py index 73acb390..9a704590 100644 --- a/main/views.py +++ b/main/views.py @@ -920,9 +920,12 @@ def load_iframe(request): def scheduler(request): crawl_requests = CrawlRequest.objects.all() + tasks_serialized = TaskSerializer(Task.objects.all(), many=True) + context = { 'crawl_requests': crawl_requests, 'timezones': pytz.common_timezones, + 'tasks': json.dumps(tasks_serialized.data) } return render(request, 'main/scheduler/index.html', context) From 72a0e08369aaf222c9df70a3475ceaf75502a40d Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Wed, 1 Mar 2023 16:49:45 -0300 Subject: [PATCH 24/89] Open scheduling detail via scheduling list --- main/staticfiles/js/scheduler/scheduler.js | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/main/staticfiles/js/scheduler/scheduler.js b/main/staticfiles/js/scheduler/scheduler.js index 7d770dc5..0e7a8301 100644 --- a/main/staticfiles/js/scheduler/scheduler.js +++ b/main/staticfiles/js/scheduler/scheduler.js @@ -404,15 +404,13 @@ function task_runtime_to_date(runtime) { } function show_task_detail(task_id) { - let task = tasks[task_id]; + $('#allScheduling').modal('hide'); + let task = TASKS[task_id]; let cur_date = new Date() - let start_date = task_runtime_to_date(task.start_date); - let repeat_info = ''; - let since = cur_date > start_date ? 'Desde de ' : 'A partir de '; since += `${start_date.getDate()} de ${MONTHS[start_date.getMonth()]} de ${start_date.getFullYear()}.`; From 93f43915c7f2f0fbdcf9486e2b1b1cff2e4bbd87 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Thu, 2 Mar 2023 12:31:34 -0300 Subject: [PATCH 25/89] Refactoring: removing unecessary variables --- main/staticfiles/js/scheduler/calendar.js | 27 +++--- main/staticfiles/js/scheduler/scheduler.js | 95 +++++++++++++------ main/staticfiles/js/scheduler/services.js | 4 +- .../scheduler/_modal_scheduling_detail.html | 5 +- 4 files changed, 91 insertions(+), 40 deletions(-) diff --git a/main/staticfiles/js/scheduler/calendar.js b/main/staticfiles/js/scheduler/calendar.js index e540c5f0..2a03aace 100644 --- a/main/staticfiles/js/scheduler/calendar.js +++ b/main/staticfiles/js/scheduler/calendar.js @@ -292,23 +292,28 @@ calendar.daily.get_daily_tasks = function () { let key = start_date = end_date = calendar.get_formated_date(day.getDate(), day.getMonth() + 1, day.getFullYear()); let tasks_of_day = services.get_tasks_in_interval(start_date, end_date); - - // global variable - tasks = {}; - if (key in tasks_of_day) - services.update_tasks(tasks_of_day[key]); - + // junta todos as listas de tasks do dia em uma única lista de tasks, garantindo que não haverá tasks repetidas + + let all_tasks_of_day = new Set(); + for (let hour in tasks_of_day) { + for (let task of tasks_of_day[hour]) { + all_tasks_of_day.add(task); + } + } + this.tasks = {}; for (let hour = 0;hour<24;hour++) this.tasks[String(hour).padStart(2, '0')] = []; - let task_runtime; - for (let task_id in tasks) { - task_runtime = tasks[task_id].start_date; + let task_runtime, task; + + // iterates over all tasks of the day + for (let task_id of all_tasks_of_day) { + task = services.get_task(task_id); + task_runtime = task.start_date; key = calendar.get_hour_from_str_datetime(task_runtime); - this.tasks[key].push(tasks[task_id]); + this.tasks[key].push(task); } - } calendar.daily.show = function () { diff --git a/main/staticfiles/js/scheduler/scheduler.js 
b/main/staticfiles/js/scheduler/scheduler.js index 0e7a8301..f282ba71 100644 --- a/main/staticfiles/js/scheduler/scheduler.js +++ b/main/staticfiles/js/scheduler/scheduler.js @@ -47,13 +47,18 @@ var calendar_mode = null; //daily, weekly, monthly or yearly var tasks; var task_being_edited = null; +// the detail modal can be opened by the modal that lists all schedulings. +// This variable is used to know if the modal of all schedulings should be +// opened when the detail modal is closed +var open_all_schedulings = false; + function open_set_scheduling(creating_task = true) { if (creating_task) { init_default_options(); } - $('#setSchedulingModal').modal('show'); + $('#setSchedulingModal').modal({ backdrop: 'static', keyboard: false }, 'show'); } function hide_set_scheduling() { @@ -65,7 +70,7 @@ function hide_set_scheduling() { function open_personalized_crawler_repeat() { $('#setSchedulingModal').modal('hide'); - $('#personalizedCrawlerRepetion').modal('show'); + $('#personalizedCrawlerRepetion').modal({ backdrop: 'static', keyboard: false }, 'show'); } function show_days_of_week_repeat_options() { @@ -160,8 +165,6 @@ function init_default_options() { function repeat_to_text(repeat) { let s = ''; - console.warn(repeat); - switch (repeat.mode) { case 'daily': if (repeat.interval == 1) @@ -281,7 +284,7 @@ function update_repeat_info() { function close_personalized_repeat_modal() { $('#personalizedCrawlerRepetion').modal('hide'); - $('#setSchedulingModal').modal('show'); + $('#setSchedulingModal').modal({ backdrop: 'static', keyboard: false }, 'show'); } function update_calendar_mode(mode) { @@ -403,10 +406,12 @@ function task_runtime_to_date(runtime) { } -function show_task_detail(task_id) { +function show_task_detail(task_id, open_all_schedulings_after_close = false) { $('#allScheduling').modal('hide'); - + + open_all_schedulings = open_all_schedulings_after_close; let task = TASKS[task_id]; + let cur_date = new Date() let start_date = task_runtime_to_date(task.start_date); let repeat_info = ''; @@ -486,26 +491,40 @@ function show_task_detail(task_id) { $('#detailSchedulingContent').html(task_detail_html); - $('#detailScheduling').modal('show'); + $('#detailScheduling').modal({ backdrop: 'static', keyboard: false }, 'show'); } function edit_scheduling_task(task_id) { $('#detailScheduling').modal('hide'); - if (!tasks[task_id]) { + let task = TASKS[task_id]; + if (!task) { console.error('Invalid task id!'); return; } + let personalized_repeat = task.personalized_repeat; + if (!personalized_repeat) { + personalized_repeat = { + mode: 'daily', + interval: 1, + data: null, + finish: { + mode: 'never', + value: null + } + }; + } + task_being_edited = { - id: tasks[task_id].id, - crawl_request: tasks[task_id].crawl_request, - start_date: tasks[task_id].start_date, - timezone: tasks[task_id].timezone, - crawler_queue_behavior: tasks[task_id].crawler_queue_behavior, - repeat_mode: tasks[task_id].repeat_mode, - personalized_repeat: tasks[task_id].personalized_repeat + id: task.id, + crawl_request: task.crawl_request, + start_date: task.start_date, + timezone: task.timezone, + crawler_queue_behavior: task.crawler_queue_behavior, + repeat_mode: task.repeat_mode, + personalized_repeat: personalized_repeat }; fill_set_scheduling(task_id); @@ -524,11 +543,13 @@ function delete_schedule_task(task_id) { } function fill_set_scheduling(task_id) { - let task = tasks[task_id]; + let task = TASKS[task_id]; if (!task) return; + console.log('>>', task); + let start_date = task.start_date.substr(0, 16); 
$(`#crawl-selector option[value='${task.crawl_request}']`).attr('selected', 'selected'); @@ -857,7 +878,7 @@ function create_task_item(task) { title="Visualizar agendamento" style="width: 1.75rem; height: 1.75rem;" class="scheduling-item text-muted rounded-circle bg-white border-0 d-flex align-items-center justify-content-center mr-2" - onclick="show_task_detail(${task.id})"> + onclick="show_task_detail(${task.id}, true)"> @@ -873,36 +894,56 @@ function create_task_item(task) { ` } -function update_task_list(tasks) { +function fill_task_list() { let task_list = $('#task-list'); let task_items = []; - for (let i = 0; i < TASKS.length; i++) - task_items.push(create_task_item(TASKS[i])); + for (let task_id in TASKS) + task_items.push(create_task_item(TASKS[task_id])); task_list.html(task_items.join('')); } function show_all_scheduling() { - $('#allScheduling').modal('show'); + $('#allScheduling').modal({ backdrop: 'static', keyboard: false }, 'show'); +} + +function close_detail_scheduling_modal() { + $('#detailScheduling').modal('hide'); + + if (open_all_schedulings) { + open_all_schedulings = false; + show_all_scheduling(); + } } $(document).ready(function () { - update_task_list(TASKS); + // transforma a lista de tarefas em um objeto onde o campo id é a chave + TASKS = TASKS.reduce((obj, item) => { + obj[item.id] = item; + return obj; + }, {}); + + fill_task_list(); // quando o usuário está digitando em search-task, filtra a lista de tarefas e atualiza a lista - $('#search-task').on('keyup', function () { let search = $(this).val().toLowerCase(); let task_list = $('#task-list'); let task_items = []; - for (let i = 0; i < TASKS.length; i++) { - if (TASKS[i].crawler_name.toLowerCase().includes(search)) - task_items.push(create_task_item(TASKS[i])); + + // itera por cada objeto no objeto TASKS + for (let task_id in TASKS) { + let task = TASKS[task_id]; + + // se o nome do crawler contém a string de busca, adiciona o item na lista + if (task.crawler_name.toLowerCase().includes(search)) + task_items.push(create_task_item(task)); } + if (task_items.length == 0) task_items.push(`
  • +
    +
    +

    ${task.crawler_name}

    +
    +
    +
  • + `; +} + +function show_more_schedulings(tasks_not_shown, day, hour) { + let splited_day = day.split('-'); + + let curr_day = new Date(parseInt(splited_day[2]), + parseInt(splited_day[1]) - 1, + parseInt(splited_day[0]), + hour); + + // formatar data para o formato dia da semana, dia/mês/anos, a partir das horas e minutos + let formatted_date = curr_day.toLocaleDateString('pt-BR', { + weekday: 'long', + day: 'numeric', + month: 'long', + year: 'numeric' + }); + + let from_hour = curr_day.getHours(); + let to_hour = from_hour + 1; + + if (to_hour >= 24) + to_hour = 0; + + formatted_date += `, ${from_hour}h - ${to_hour}h`; + + // first letter uppercase + formatted_date = formatted_date.charAt(0).toUpperCase() + formatted_date.slice(1); + + $('#showMoreSchedulings .modal-title').html(formatted_date); + + let task_list = $('#showMoreSchedulings .modal-body ul'); + + let task_items = [], task; + for (let i in tasks_not_shown) { + task_id = tasks_not_shown[i]; + task = TASKS[task_id]; + task_items.push(get_more_scheduling_li_html(task, curr_day)); + } + + task_list.html(task_items.join('')); + + open_show_more_schedulings_modal(); +} + $(document).ready(function () { // transforma a lista de tarefas em um objeto onde o campo id é a chave TASKS = TASKS.reduce((obj, item) => { diff --git a/main/templates/main/scheduler/_modal_show_more.html b/main/templates/main/scheduler/_modal_show_more.html new file mode 100644 index 00000000..9be8f043 --- /dev/null +++ b/main/templates/main/scheduler/_modal_show_more.html @@ -0,0 +1,25 @@ + \ No newline at end of file diff --git a/main/templates/main/scheduler/index.html b/main/templates/main/scheduler/index.html index 5385a8be..4dc7b0cc 100644 --- a/main/templates/main/scheduler/index.html +++ b/main/templates/main/scheduler/index.html @@ -49,6 +49,8 @@ {% include "main/scheduler/_modal_all_schedulings.html" %} +{% include "main/scheduler/_modal_show_more.html" %} + {% endblock %} {% block js %} From 753e4260474012ee0b0316c5a067052deba25858 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Mon, 20 Mar 2023 16:06:06 -0300 Subject: [PATCH 42/89] Improves interface --- main/staticfiles/js/scheduler/calendar.js | 2 +- main/staticfiles/js/scheduler/scheduler.js | 30 ++++++++++++++++++++++ main/staticfiles/js/scheduler/services.js | 13 ++++++++-- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/main/staticfiles/js/scheduler/calendar.js b/main/staticfiles/js/scheduler/calendar.js index 89d4ff44..bec04e05 100644 --- a/main/staticfiles/js/scheduler/calendar.js +++ b/main/staticfiles/js/scheduler/calendar.js @@ -608,7 +608,7 @@ calendar.daily.show = function () { onclick="show_task_detail(${task.id})" title="${title}" class="${bg_color} text-white rounded-pill px-2 ml-2 mt-2"> - ${task.crawler_name} +

    ${task.crawler_name}

    `); diff --git a/main/staticfiles/js/scheduler/scheduler.js b/main/staticfiles/js/scheduler/scheduler.js index 01588553..784ec46e 100644 --- a/main/staticfiles/js/scheduler/scheduler.js +++ b/main/staticfiles/js/scheduler/scheduler.js @@ -49,6 +49,7 @@ var calendar_mode = null; //daily, weekly, monthly or yearly var task_being_edited = null; var modal_open_callback; +var excluded_tasks = []; function open_set_scheduling(creating_task = true) { @@ -1043,6 +1044,7 @@ function get_more_scheduling_li_html(task, curr_day) { return `
  • @@ -1089,6 +1091,11 @@ function show_more_schedulings(tasks_not_shown, day, hour) { let task_items = [], task; for (let i in tasks_not_shown) { task_id = tasks_not_shown[i]; + + // ckeck if task is in excluded tasks + if (excluded_tasks.includes(task_id)) + continue; + task = TASKS[task_id]; task_items.push(get_more_scheduling_li_html(task, curr_day)); } @@ -1098,6 +1105,29 @@ function show_more_schedulings(tasks_not_shown, day, hour) { open_show_more_schedulings_modal(); } +function update_view() { + switch (calendar_mode) { + case 'daily': + calendar.daily.show(); + break; + + case 'weekly': + calendar.weekly.show(); + break; + + case 'monthly': + calendar.monthly.show(); + break; + + case 'yearly': + calendar.yearly.show(); + break; + + default: + break; + } +} + $(document).ready(function () { // transforma a lista de tarefas em um objeto onde o campo id é a chave TASKS = TASKS.reduce((obj, item) => { diff --git a/main/staticfiles/js/scheduler/services.js b/main/staticfiles/js/scheduler/services.js index a772bb09..07f27922 100644 --- a/main/staticfiles/js/scheduler/services.js +++ b/main/staticfiles/js/scheduler/services.js @@ -22,6 +22,8 @@ services.save_new_scheduling = function (new_scheduling_config) { TASKS[data.id] = data; fill_task_list(); + update_view(); + }, error: function (data) { alert('Houve um erro no agendamento, tente novamente!'); @@ -72,15 +74,22 @@ services.update_tasks = function (tarks_ids) { task_id = tarks_ids[i]; tasks[task_id] = this.get_task(task_id); } + + } services.delete_task = function(task_id) { + excluded_tasks.push(task_id); + + // Remove task from view + $(`#more-showing-task-${task_id}`).remove(); + $.ajax({ url: `/api/scheduler/tasks/${task_id}`, type: 'delete', async: false, success: function (data) { - calendar.daily.today(); + update_view(); }, error: function (data) { console.error(data.responseText); @@ -105,7 +114,7 @@ services.save_updated_scheduling = function (task_being_edited) { $('#toast').toast('show'); TASKS[task_id] = data; - + update_view(); }, error: function (data) { console.error(data.responseText); From fa4f1a04c8562920fe5aa3df02bcd0ffe0eec4e8 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 21 Mar 2023 13:38:34 -0300 Subject: [PATCH 43/89] Monthly viewer for schedulings --- main/staticfiles/css/style.css | 18 +- main/staticfiles/js/scheduler/calendar.js | 224 ++++++++++++++++-- main/staticfiles/js/scheduler/scheduler.js | 11 +- main/staticfiles/js/scheduler/services.js | 1 - .../main/scheduler/_calendar_monthly.html | 2 +- 5 files changed, 219 insertions(+), 37 deletions(-) diff --git a/main/staticfiles/css/style.css b/main/staticfiles/css/style.css index 8b458326..6a61a9a0 100644 --- a/main/staticfiles/css/style.css +++ b/main/staticfiles/css/style.css @@ -378,15 +378,19 @@ a.close, a.add_form { column-gap: 15px; row-gap: 15px; height: calc(100vh - 148px); - grid-template-columns: 1fr 1fr 1fr 1fr 1fr 1fr 1fr; - grid-template-rows: 0fr 1fr 1fr 1fr 1fr 1fr 1fr; + grid-template-columns: repeat(7, 1fr); + grid-template-rows: 0fr repeat(6, 1fr); } .calendar-cell { display: inline-grid; +} +.calendar-cell-monthly { + min-height: 1em; } + .no-current-month { color: lightgray; } @@ -394,11 +398,19 @@ a.close, a.add_form { .calendar-cell-content { display: flex; justify-content: space-around; - align-items: center; + align-items: start; height: 100%; flex-direction: column; } +.calendar-cell-monthly-content { + overflow-y: auto; + /* display: flex; */ + /* justify-content: space-around; */ + /* align-items: center; */ + /* 
flex-direction: column; */ +} + .calendar-yearly { display: none; column-gap: 25px; diff --git a/main/staticfiles/js/scheduler/calendar.js b/main/staticfiles/js/scheduler/calendar.js index bec04e05..f4e09a25 100644 --- a/main/staticfiles/js/scheduler/calendar.js +++ b/main/staticfiles/js/scheduler/calendar.js @@ -54,6 +54,7 @@ var FLEX_CENTER = 'd-flex justify-content-center align-items-center'; var JANUARY = 0; var DECEMBER = 11; +var MAX_TASKS = 3; var calendar = {}; @@ -83,38 +84,194 @@ calendar.weekly.active_start_day = calendar.weekly.curr_start_day = new Date(cal calendar.monthly.active_month = calendar.monthly.curr_month = new Date(calendar.curr_date.getFullYear(), calendar.curr_date.getMonth()); calendar.yearly.active_year = calendar.yearly.curr_year = calendar.curr_date.getFullYear(); +calendar.monthly.tasks = {}; -calendar.fill_month = function (container, year, month) { +calendar.monthly.get_tasks = function (start_date, end_date) { + let formatted_start_date = calendar.get_formated_date(start_date.getDate(), start_date.getMonth() + 1, start_date.getFullYear()); + let formatted_end_date = calendar.get_formated_date(end_date.getDate(), end_date.getMonth() + 1, end_date.getFullYear()); + + let tasks = services.get_tasks_in_interval(formatted_start_date, formatted_end_date); + + this.tasks = tasks; +} + +calendar.monthly.get_schedulings_html = function (schedulings, curr_date) { + if (!schedulings) + return ''; + + let schedulings_html = '', task; + + let title = ''; + let bg_color = ''; + let opacity = ''; + + let num_tasks = schedulings.length; + + if (num_tasks > MAX_TASKS) { + for (let i = 0; i < MAX_TASKS - 1; i++) { + task = TASKS[schedulings[i]]; + + switch (task.crawler_queue_behavior) { + case 'wait_on_first_queue_position': + bg_color = 'bg-warning'; + break; + + case 'run_immediately': + bg_color = 'bg-danger'; + break; + + default: + bg_color = 'bg-primary'; + break; + } + + [title, opacity] = get_task_title_and_opacity(task, curr_date); + + schedulings_html += `
    +

    + ${task.crawler_name} +

    +
    `; + } + + let tasks_not_shown = []; + for (let i = MAX_TASKS - 1; i < num_tasks; i++) + tasks_not_shown.push(TASKS[schedulings[i]].id); + + let formatted_date = calendar.get_formated_date(curr_date.getDate(), curr_date.getMonth() + 1, curr_date.getFullYear()); + + schedulings_html += ` +
    +

    +${num_tasks - MAX_TASKS + 1} outras

    +
    `; + + } else { + for (let i = 0; i < schedulings.length; i++) { + task = TASKS[schedulings[i]]; + + switch (task.crawler_queue_behavior) { + case 'wait_on_first_queue_position': + bg_color = 'bg-warning'; + break; + + case 'run_immediately': + bg_color = 'bg-danger'; + break; + + default: + bg_color = 'bg-primary'; + break; + } + + [title, opacity] = get_task_title_and_opacity(task, curr_date) + + schedulings_html += `
    +

    + ${task.crawler_name} +

    +
    `; + } + } + + return schedulings_html; +} + +calendar.monthly.fill_cells = function (container, year, month) { let first_day = new Date(year, month, 1); let last_day = new Date(year, month + 1, 0); - + let weekday_month_start = first_day.getDay(); let num_days_previous_month = (new Date(year, month, 0)).getDate(); + + let previous_month = num_days_previous_month - weekday_month_start + 1 > 0 ? month - 1 : month; - let calendar_cells = []; - let i; + let start_date = new Date(year, previous_month, num_days_previous_month - weekday_month_start + 1); + let end_date; - for (i = 0; i < WEEKDAYS.length; i++) - calendar_cells.push(`
    ${WEEKDAYS[i]}
    `); + if (6 - last_day.getDay() == 0) + end_date = last_day; + + else + end_date = new Date(year, month + 1, 6 - last_day.getDay()); + + this.get_tasks(start_date, end_date); + + let calendar_cells = []; + for (i = 0; i < WEEKDAYS.length; i++) { + calendar_cells.push(`
    +

    + ${WEEKDAYS[i]} +

    +
    `); + } i = num_days_previous_month - weekday_month_start + 1; + let date, formatted_date, schedulings_html; + + let bg_color = ''; for (i; i <= num_days_previous_month; i++) { - calendar_cells.push(`
    - ${i} + date = new Date(year, previous_month, i); + formatted_date = calendar.get_formated_date(date.getDate(), date.getMonth() + 1, date.getFullYear()); + schedulings_html = this.get_schedulings_html(this.tasks[formatted_date], date); + + bg_color = date.getDay() % 2 ? 'bg-light' : 'bg-white'; + + calendar_cells.push(`
    +
    +
    +

    ${i}

    +
    +
    + ${schedulings_html} +
    +
    `); } let is_curr_day_css = ''; for (i = 1; i <= last_day.getDate(); i++) { - if (this.curr_date.getFullYear() == year && this.curr_date.getMonth() == month && this.curr_date.getDate() == i) + if (calendar.curr_date.getFullYear() == year && calendar.curr_date.getMonth() == month && calendar.curr_date.getDate() == i) is_curr_day_css = `class="bg-primary rounded-circle text-white text-center border font-weight-bold" style="width: 1.8em; height: 2.3em; padding-top: 2px;"`; - calendar_cells.push(`
    -
    - ${i} -
    -
    - + date = new Date(year, month, i); + + bg_color = date.getDay() % 2 ? 'bg-light' : 'bg-white'; + + formatted_date = calendar.get_formated_date(date.getDate(), date.getMonth() + 1, date.getFullYear()); + schedulings_html = this.get_schedulings_html(this.tasks[formatted_date], date); + + calendar_cells.push(`
    +
    +
    +

    ${i}

    +
    +
    + ${schedulings_html} +
    `); @@ -123,16 +280,32 @@ calendar.fill_month = function (container, year, month) { let diff_until_saturday = 6 - last_day.getDay(); - for (i = 1; i <= diff_until_saturday; i++) - calendar_cells.push(`
    ${i}
    `); + for (i = 1; i <= diff_until_saturday; i++) { + date = new Date(year, month + 1, i); + + formatted_date = calendar.get_formated_date(date.getDate(), date.getMonth() + 1, date.getFullYear()); + schedulings_html = this.get_schedulings_html(this.tasks[formatted_date], date); + bg_color = date.getDay() % 2 ? 'bg-light' : 'bg-white'; + + calendar_cells.push(`
    +
    +
    +

    ${i}

    +
    +
    + ${schedulings_html} +
    +
    +
    `); + } container.empty(); container.html(calendar_cells.join('\n')) } calendar.monthly.show = function () { - calendar.fill_month(this.container, this.active_month.getFullYear(), this.active_month.getMonth()); + this.fill_cells(this.container, this.active_month.getFullYear(), this.active_month.getMonth()); this.container.css('display', 'grid'); calendar.date_info.text(`${MONTHS[this.active_month.getMonth()]} de ${this.active_month.getFullYear()}`); } @@ -258,11 +431,10 @@ calendar.weekly.get_datetime_tasks = function (week_day, hour) { let tasks = this.tasks[day][hour]; let task, task_repr, task_reprs = [], bg_color; - let max_tasks = 3; let num_tasks = tasks.length; - if (num_tasks > max_tasks) { - for (let i = 0; i < max_tasks - 1; i++) { + if (num_tasks > MAX_TASKS) { + for (let i = 0; i < MAX_TASKS - 1; i++) { task = tasks[i]; switch (task.crawler_queue_behavior) { @@ -297,16 +469,18 @@ calendar.weekly.get_datetime_tasks = function (week_day, hour) { } let tasks_not_shown = []; - for (let i = max_tasks - 1; i < num_tasks; i++) + for (let i = MAX_TASKS - 1; i < num_tasks; i++) tasks_not_shown.push(tasks[i].id); + // transforma a data no formato dia/mês/ano + task_repr = `
    -

    +${num_tasks - 2} outras

    +

    +${num_tasks - MAX_TASKS + 1} outras

    `; task_reprs.push(task_repr); @@ -330,8 +504,7 @@ calendar.weekly.get_datetime_tasks = function (week_day, hour) { let [title, opacity] = get_task_title_and_opacity(task, curr_day); - task_repr = ` -
    = 24) to_hour = 0; diff --git a/main/staticfiles/js/scheduler/services.js b/main/staticfiles/js/scheduler/services.js index 07f27922..858a6925 100644 --- a/main/staticfiles/js/scheduler/services.js +++ b/main/staticfiles/js/scheduler/services.js @@ -39,7 +39,6 @@ services.get_tasks_in_interval = function (start_date, end_date) { async: false, success: function (data) { tasks_by_date = data; - console.log('Tasks received!', tasks_by_date); }, error: function (data) { console.error(data.responseText); diff --git a/main/templates/main/scheduler/_calendar_monthly.html b/main/templates/main/scheduler/_calendar_monthly.html index b209065d..ba896a16 100644 --- a/main/templates/main/scheduler/_calendar_monthly.html +++ b/main/templates/main/scheduler/_calendar_monthly.html @@ -1,2 +1,2 @@ -
    +
    \ No newline at end of file From 9120a43bc5a6961d9d8bd6a48af6f0162ec1c283 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Fri, 24 Mar 2023 12:15:06 -0300 Subject: [PATCH 44/89] View schedulings by year --- main/staticfiles/js/scheduler/calendar.js | 88 +++++++++++++++++-- main/staticfiles/js/scheduler/scheduler.js | 18 +++- .../main/scheduler/_calendar_yearly.html | 33 ++++--- 3 files changed, 117 insertions(+), 22 deletions(-) diff --git a/main/staticfiles/js/scheduler/calendar.js b/main/staticfiles/js/scheduler/calendar.js index f4e09a25..d5d4865b 100644 --- a/main/staticfiles/js/scheduler/calendar.js +++ b/main/staticfiles/js/scheduler/calendar.js @@ -153,7 +153,7 @@ calendar.monthly.get_schedulings_html = function (schedulings, curr_date) { schedulings_html += `

    +${num_tasks - MAX_TASKS + 1} outras

    `; @@ -251,10 +251,15 @@ calendar.monthly.fill_cells = function (container, year, month) {
    `); } - let is_curr_day_css = ''; + let custom_style = ''; for (i = 1; i <= last_day.getDate(); i++) { - if (calendar.curr_date.getFullYear() == year && calendar.curr_date.getMonth() == month && calendar.curr_date.getDate() == i) - is_curr_day_css = `class="bg-primary rounded-circle text-white text-center border font-weight-bold" style="width: 1.8em; height: 2.3em; padding-top: 2px;"`; + if (calendar.curr_date.getFullYear() == year + && calendar.curr_date.getMonth() == month + && calendar.curr_date.getDate() == i) + custom_style = `class="bg-primary rounded-circle text-white text-center border font-weight-bold" style="width: 1.8em; height: 1.9em; padding-top: 2px;"`; + else + custom_style = 'class="m-0 p-0 font-weight-bold"'; + date = new Date(year, month, i); @@ -266,7 +271,7 @@ calendar.monthly.fill_cells = function (container, year, month) { calendar_cells.push(`
    -

    ${i}

    +

    ${i}

    @@ -275,7 +280,6 @@ calendar.monthly.fill_cells = function (container, year, month) {
    `); - is_curr_day_css = ''; } let diff_until_saturday = 6 - last_day.getDay(); @@ -329,16 +333,84 @@ calendar.monthly.hide = function () { this.container.css('display', 'none'); } +calendar.show_tasks_of_day = function (date) { + let start_date = end_date = calendar.get_formated_date(date.getDate(), date.getMonth() + 1, date.getFullYear()); + + let tasks = services.get_tasks_in_interval(start_date, end_date); + let tasks_of_day = []; + + //checks if tasks has a key equals to start_date + if (tasks.hasOwnProperty(start_date)) + tasks_of_day = tasks[start_date]; + + show_more_schedulings(tasks_of_day, start_date, 0, 0); +} + +calendar.yearly.fill_month = function (container, year, month) { + let first_day = new Date(year, month, 1); + let last_day = new Date(year, month + 1, 0); + + let weekday_month_start = first_day.getDay(); + let num_days_previous_month = (new Date(year, month, 0)).getDate(); + + let calendar_cells = []; + let i; + + for (i = 0; i < WEEKDAYS.length; i++) + calendar_cells.push(`
    ${WEEKDAYS[i]}
    `); + + i = num_days_previous_month - weekday_month_start + 1; + for (i; i <= num_days_previous_month; i++) { + calendar_cells.push(`
    + ${i} +
    `); + } + + + + let is_curr_day_css = ''; + for (i = 1; i <= last_day.getDate(); i++) { + if (calendar.curr_date.getFullYear() == year && calendar.curr_date.getMonth() == month && calendar.curr_date.getDate() == i) + is_curr_day_css = `class="bg-primary rounded-circle text-white text-center border font-weight-bold" style="width: 1.75em; height: 1.9m; padding-top: 2px;"`; + + calendar_cells.push(`
    +
    + ${i} +
    +
    `); + + is_curr_day_css = ''; + } + + let diff_until_saturday = 6 - last_day.getDay(); + + for (i = 1; i <= diff_until_saturday; i++) + calendar_cells.push(`
    + ${i} +
    `); + + + container.empty(); + container.html(calendar_cells.join('\n')) +} + calendar.yearly.show = function () { let i, month_el, month; + for (i=0;i= 24) to_hour = 0; - formatted_date += `, ${from_hour}h - ${to_hour}h`; + if (from_hour != to_hour) + formatted_date += `, ${from_hour}h - ${to_hour}h`; // first letter uppercase formatted_date = formatted_date.charAt(0).toUpperCase() + formatted_date.slice(1); @@ -1087,6 +1088,21 @@ function show_more_schedulings(tasks_not_shown, date, from_hour, to_hour=null) { let task_list = $('#showMoreSchedulings .modal-body ul'); + if (tasks_not_shown.length == 0) { + + task_list.html(`
  • +
    + ¯\\_(ツ)_/¯ +

    Sem agendamentos para a data informada!

    +
    +
  • `); + + open_show_more_schedulings_modal(); + + return; + } + let task_items = [], task; for (let i in tasks_not_shown) { task_id = tasks_not_shown[i]; diff --git a/main/templates/main/scheduler/_calendar_yearly.html b/main/templates/main/scheduler/_calendar_yearly.html index e28d727b..ea7da4b5 100644 --- a/main/templates/main/scheduler/_calendar_yearly.html +++ b/main/templates/main/scheduler/_calendar_yearly.html @@ -1,72 +1,79 @@ -
    -
    + +
    +

    Janeiro

    -
    +

    Fevereiro

    -
    +

    Março

    -
    +

    Abril

    -
    +

    Maio

    -
    +

    Junho

    -
    +

    Julho

    -
    +

    Agosto

    -
    +

    Setembro

    -
    +

    Outubro

    -
    +

    Novembro

    -
    +

    Dezembro

    From 2801f0936522ec7a552246c94b7a8274954b84b9 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Fri, 31 Mar 2023 13:27:13 -0300 Subject: [PATCH 45/89] ORM to scheduler classes --- src/schedule/schedule/schedule.py | 5 + src/schedule/schedule/scheduler_config.py | 120 +++++++++++++++++----- src/schedule/schedule/utils.py | 2 +- 3 files changed, 99 insertions(+), 28 deletions(-) diff --git a/src/schedule/schedule/schedule.py b/src/schedule/schedule/schedule.py index c2e9613b..37bfc368 100644 --- a/src/schedule/schedule/schedule.py +++ b/src/schedule/schedule/schedule.py @@ -7,6 +7,11 @@ from schedule.utils import * from schedule.scheduler_config import SchedulerConfig +from sqlalchemy import create_engine, Column, Integer, String, DateTime, ARRAY +from sqlalchemy.orm import sessionmaker +from sqlalchemy.orm import declarative_base +from sqlalchemy.orm.session import Session + logger = logging.getLogger('scheduler') logger.setLevel(logging.DEBUG) diff --git a/src/schedule/schedule/scheduler_config.py b/src/schedule/schedule/scheduler_config.py index cc2ca5ae..a6625977 100644 --- a/src/schedule/schedule/scheduler_config.py +++ b/src/schedule/schedule/scheduler_config.py @@ -2,10 +2,31 @@ from typing import List, Optional, Union import pytz +import environ from typing_extensions import Literal, TypedDict -from schedule.constants import * -from schedule.utils import * +from constants import * +from utils import * + +from sqlalchemy import create_engine, Column, Integer, String, DateTime, ARRAY +from sqlalchemy.orm import sessionmaker +from sqlalchemy.orm import declarative_base +from sqlalchemy.orm.session import Session + +env = environ.Env( + POSTGRES_SCHEDULER_CONFIG_TABLE_NAME=(str, 'scheduler_config'), + POSTGRES_USER=(str, 'django'), + POSTGRES_PASSWORD=(str, 'c01_password'), + POSTGRES_HOST=(str, 'localhost'), + POSTGRES_PORT=(int, 5432), + POSTGRES_DB=(str, 'c01_prod'), +) + +Base = declarative_base() + +DB_URI = f'postgresql://{env("POSTGRES_USER")}:{env("POSTGRES_PASSWORD")}@{env("POSTGRES_HOST")}:{env("POSTGRES_PORT")}/{env("POSTGRES_DB")}' + +# DB_URI = 'postgresql://django:c01_password@localhost:5432/c01_prod' class Finish(TypedDict): '''Define qual parâmetro para parar de reagendar uma coleta, a saber: @@ -69,31 +90,44 @@ class SchedulerConfigValueError(SchedulerConfigError): class SchedulerConfigInvalidRepeatModeError(SchedulerConfigError): pass -class SchedulerConfig: - def __init__(self) -> None: - self.start_date: datetime.datetime = None - self.timezone = None +class SchedulerConfig(Base): + __tablename__ = env('POSTGRES_SCHEDULER_CONFIG_TABLE_NAME') - self.repeat_mode: str = NO_REPEAT_MODE - self.repeat_interval: int = 1 - - self.max_repeats: Optional[int] = None - self.max_datetime: Optional[datetime.datetime] = None + id = Column(Integer, primary_key=True) - # If repeat_mode == 'weekly', the days of week to run - self.weekdays_to_run: Optional[List[int]] = None + start_date: datetime.datetime = Column(DateTime, nullable=False) + timezone: str = Column(String, nullable=True) - # Can be day-x, first-weekday, last-weekday - self.monthly_repeat_mode: Optional[str] = None - - # If monthly_repeat_mode is day-x, the variable represents the day of month scheduled. - # However, if monthly_repeat_mode is first-weekday or last-weekday, the value in the - # variable is the first or last weekday of month scheduled, respectivelly. 
- self.monthly_day_x_ocurrence: Optional[int] = None + repeat_mode: str = Column(String, default=NO_REPEAT_MODE) + repeat_interval: int = Column(Integer, default=1) + + max_repeats: Optional[int] = Column(Integer, default=None) + max_datetime: Optional[datetime.datetime] = Column(DateTime, default=None) - self.monthly_day_x_ocurrence: Optional[int] = None - self.monthly_first_weekday: Optional[int] = None - self.monthly_last_weekday: Optional[int] = None + # If repeat_mode == 'weekly', the days of week to run + weekdays_to_run: Optional[List[int]] = Column(ARRAY(Integer), default=None) + + # Can be day-x, first-weekday, last-weekday + monthly_repeat_mode: Optional[str] = Column(String, default=None) + + # If monthly_repeat_mode is day-x, the variable represents the day of month scheduled. + # However, if monthly_repeat_mode is first-weekday or last-weekday, the value in the + # variable is the first or last weekday of month scheduled, respectivelly. + monthly_day_x_ocurrence: Optional[int] = Column(Integer, default=None) + + monthly_day_x_ocurrence: Optional[int] = Column(Integer, default=None) + monthly_first_weekday: Optional[int] = Column(Integer, default=None) + monthly_last_weekday: Optional[int] = Column(Integer, default=None) + + session = None + + def __init__(self, session: Session): + super().__init__() + self.session = session + + def save(self): + self.session.add(self) + self.session.commit() def first_run_date(self) -> datetime.datetime: start_date = self.start_date @@ -161,7 +195,7 @@ def first_run_date(self) -> datetime.datetime: else: raise SchedulerConfigError('Invalid repeat mode') - + def next_run_date(self, last_run_date: datetime.datetime) -> datetime.datetime: if self.repeat_mode == NO_REPEAT_MODE: return None @@ -203,7 +237,7 @@ def load_config(self, config_dict: SchedulerConfigDict) -> None: # We assume that the config_dict is valid. 
That is, it has been validated before # SchedulerConfig.valid_config(config_dict) - self.timezone = pytz.timezone(config_dict['timezone']) + self.timezone = config_dict['timezone'] self.start_date = decode_datetimestr(config_dict['start_date']) self.repeat_mode = config_dict['repeat_mode'] @@ -211,7 +245,8 @@ def load_config(self, config_dict: SchedulerConfigDict) -> None: self._parse_personalized_config(config_dict['personalized_repeat']) def now(self) -> datetime.datetime: - return datetime.datetime.now(self.timezone).replace(tzinfo=None) + timezone = pytz.timezone(self.timezone) + return datetime.datetime.now(timezone).replace(tzinfo=None) def _parse_personalized_config(self, config_dict: PersonalizedRepeat) -> None: self.repeat_mode = config_dict['mode'] @@ -393,4 +428,35 @@ def valid_config(config_dict: SchedulerConfigDict) -> None: if finish_date < now: raise SchedulerConfigValueError(f'When the field `mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_DATE}`, ' \ - f'the value of field `value` must be a datetime greater than now.') \ No newline at end of file + f'the value of field `value` must be a datetime greater than now.') + +if __name__ == '__main__': + engine = create_engine(DB_URI, echo=False) + Base.metadata.create_all(engine) + + Session = sessionmaker(bind=engine) + session = Session() + + config = { + 'start_date': '2023-03-08T13:14', + 'timezone': 'America/Sao_Paulo', + 'repeat_mode': 'personalized', + 'personalized_repeat': { + 'mode': 'weekly', + 'data': [0, 1, 2, 3, 4, 5, 6], + 'interval': 3 + } + } + + scheduler_config = SchedulerConfig(session) + scheduler_config.load_config(config) + + scheduler_config.save() + + # configs = session.query(SchedulerConfig).all() + + # for config in configs: + # print(config.start_date) + # print(config.timezone) + # print(config.repeat_mode) + # print(config.weekdays_to_run) \ No newline at end of file diff --git a/src/schedule/schedule/utils.py b/src/schedule/schedule/utils.py index 3c0ad4ad..fdacedf6 100644 --- a/src/schedule/schedule/utils.py +++ b/src/schedule/schedule/utils.py @@ -2,7 +2,7 @@ import datetime from typing import Optional -from schedule.constants import * +from constants import * def get_date(year: int, month: int, day: int, hour: int = 0, minute: int = 0, second: int = 0) -> datetime: From 720ecc1c3903e407ff61c1754799db60f743569b Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Wed, 19 Apr 2023 15:34:08 -0300 Subject: [PATCH 46/89] Scheduler lib with improviments --- src/schedule/schedule/__init__.py | 14 +- .../{scheduler_config.py => config.py} | 75 ++------ src/schedule/schedule/constants.py | 25 +++ src/schedule/schedule/function_wrapper.py | 22 +++ src/schedule/schedule/job.py | 116 ++++++++++++ src/schedule/schedule/schedule.py | 165 ------------------ src/schedule/schedule/scheduler.py | 108 ++++++++++++ src/schedule/schedule/utils.py | 8 +- src/schedule/setup.py | 2 +- 9 files changed, 295 insertions(+), 240 deletions(-) rename src/schedule/schedule/{scheduler_config.py => config.py} (92%) create mode 100644 src/schedule/schedule/function_wrapper.py create mode 100644 src/schedule/schedule/job.py delete mode 100644 src/schedule/schedule/schedule.py create mode 100644 src/schedule/schedule/scheduler.py diff --git a/src/schedule/schedule/__init__.py b/src/schedule/schedule/__init__.py index c1305194..269b71d7 100644 --- a/src/schedule/schedule/__init__.py +++ b/src/schedule/schedule/__init__.py @@ -1,11 +1,11 @@ from schedule.constants import * from schedule.utils import * -from 
schedule.schedule import Job, Scheduler -from schedule.scheduler_config import (SchedulerConfig, - SchedulerConfigDict, - PersonalizedRepeat, - MonthlyRepeatConf, - Finish) -from schedule.schedule import Job, Scheduler +from schedule.scheduler import Job, Scheduler +from schedule.config import (Config, + SchedulerConfigDict, + PersonalizedRepeat, + MonthlyRepeatConf, + Finish) +from schedule.scheduler import Job, Scheduler diff --git a/src/schedule/schedule/scheduler_config.py b/src/schedule/schedule/config.py similarity index 92% rename from src/schedule/schedule/scheduler_config.py rename to src/schedule/schedule/config.py index a6625977..f45e6369 100644 --- a/src/schedule/schedule/scheduler_config.py +++ b/src/schedule/schedule/config.py @@ -2,31 +2,12 @@ from typing import List, Optional, Union import pytz -import environ from typing_extensions import Literal, TypedDict -from constants import * -from utils import * +from schedule.constants import * +from schedule.utils import * -from sqlalchemy import create_engine, Column, Integer, String, DateTime, ARRAY -from sqlalchemy.orm import sessionmaker -from sqlalchemy.orm import declarative_base -from sqlalchemy.orm.session import Session - -env = environ.Env( - POSTGRES_SCHEDULER_CONFIG_TABLE_NAME=(str, 'scheduler_config'), - POSTGRES_USER=(str, 'django'), - POSTGRES_PASSWORD=(str, 'c01_password'), - POSTGRES_HOST=(str, 'localhost'), - POSTGRES_PORT=(int, 5432), - POSTGRES_DB=(str, 'c01_prod'), -) - -Base = declarative_base() - -DB_URI = f'postgresql://{env("POSTGRES_USER")}:{env("POSTGRES_PASSWORD")}@{env("POSTGRES_HOST")}:{env("POSTGRES_PORT")}/{env("POSTGRES_DB")}' - -# DB_URI = 'postgresql://django:c01_password@localhost:5432/c01_prod' +from sqlalchemy import Column, Integer, String, DateTime, ARRAY class Finish(TypedDict): '''Define qual parâmetro para parar de reagendar uma coleta, a saber: @@ -37,7 +18,6 @@ class Finish(TypedDict): mode: Literal['never', 'occurrence', 'date'] value: Union[None, int, str] - class MonthlyRepeatConf(TypedDict): ''' Caso a repetição personalizado seja por mês, o usuário pode escolher 3 tipos de agendamento mensal: - first-weekday: A coleta ocorre no primeiro dia (domingo, segunda, etc) da semana do mês, contado a partir de 0 - domingo. @@ -50,7 +30,6 @@ class MonthlyRepeatConf(TypedDict): # Se day-x, o dia do mês que a coleta deverá ocorrer. value: int - class PersonalizedRepeat(TypedDict): # Uma repetição personalizada pode ser por dia, semana, mês ou ano. 
mode: Literal['daily', 'weekly', 'monthly', 'yearly'] @@ -90,8 +69,8 @@ class SchedulerConfigValueError(SchedulerConfigError): class SchedulerConfigInvalidRepeatModeError(SchedulerConfigError): pass -class SchedulerConfig(Base): - __tablename__ = env('POSTGRES_SCHEDULER_CONFIG_TABLE_NAME') +class Config(SQL_ALCHEMY_BASE): + __tablename__ = ENV('POSTGRES_SCHED_CONFIG_TABLE_NAME') id = Column(Integer, primary_key=True) @@ -119,15 +98,14 @@ class SchedulerConfig(Base): monthly_first_weekday: Optional[int] = Column(Integer, default=None) monthly_last_weekday: Optional[int] = Column(Integer, default=None) - session = None - - def __init__(self, session: Session): + def __init__(self): super().__init__() - self.session = session + + self.repeat_interval = 1 def save(self): - self.session.add(self) - self.session.commit() + SQL_ALCHEMY_DB_SESSION.add(self) + SQL_ALCHEMY_DB_SESSION.commit() def first_run_date(self) -> datetime.datetime: start_date = self.start_date @@ -428,35 +406,4 @@ def valid_config(config_dict: SchedulerConfigDict) -> None: if finish_date < now: raise SchedulerConfigValueError(f'When the field `mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_DATE}`, ' \ - f'the value of field `value` must be a datetime greater than now.') - -if __name__ == '__main__': - engine = create_engine(DB_URI, echo=False) - Base.metadata.create_all(engine) - - Session = sessionmaker(bind=engine) - session = Session() - - config = { - 'start_date': '2023-03-08T13:14', - 'timezone': 'America/Sao_Paulo', - 'repeat_mode': 'personalized', - 'personalized_repeat': { - 'mode': 'weekly', - 'data': [0, 1, 2, 3, 4, 5, 6], - 'interval': 3 - } - } - - scheduler_config = SchedulerConfig(session) - scheduler_config.load_config(config) - - scheduler_config.save() - - # configs = session.query(SchedulerConfig).all() - - # for config in configs: - # print(config.start_date) - # print(config.timezone) - # print(config.repeat_mode) - # print(config.weekdays_to_run) \ No newline at end of file + f'the value of field `value` must be a datetime greater than now.') \ No newline at end of file diff --git a/src/schedule/schedule/constants.py b/src/schedule/schedule/constants.py index ecc79131..1f5cca47 100644 --- a/src/schedule/schedule/constants.py +++ b/src/schedule/schedule/constants.py @@ -1,3 +1,28 @@ +import environ + +from sqlalchemy import create_engine +from sqlalchemy.orm import declarative_base, sessionmaker + +ENV = environ.Env( + POSTGRES_SCHED_CONFIG_TABLE_NAME=(str, 'sched_config'), + POSTGRES_SCHED_JOB_TABLE_NAME=(str, 'sched_job'), + + POSTGRES_USER=(str, 'django'), + POSTGRES_PASSWORD=(str, 'c01_password'), + POSTGRES_HOST=(str, 'localhost'), + POSTGRES_PORT=(int, 5432), + POSTGRES_DB=(str, 'c01_prod'), +) + +# SQL ALCHEMY CONFIG + +SQL_ALCHEMY_BASE = declarative_base() +DB_URI = f'postgresql://{ENV("POSTGRES_USER")}:{ENV("POSTGRES_PASSWORD")}@{ENV("POSTGRES_HOST")}:{ENV("POSTGRES_PORT")}/{ENV("POSTGRES_DB")}' +SQL_ALCHEMY_ENGINE = create_engine(DB_URI, echo=False) +SQL_ALCHEMY_DB_SESSION = sessionmaker(bind=SQL_ALCHEMY_ENGINE)() + +# SCHEDULE CONFIG + NUM_DAYS_IN_WEEK = 7 NUM_MONTHS_IN_YEAR = 12 diff --git a/src/schedule/schedule/function_wrapper.py b/src/schedule/schedule/function_wrapper.py new file mode 100644 index 00000000..5525b0cb --- /dev/null +++ b/src/schedule/schedule/function_wrapper.py @@ -0,0 +1,22 @@ +from typing import Any, Callable, Dict, List, Any + +class FunctionWrapper: + def __init__(self, funct: Callable, *args, **kwargs): + self.funct: Callable = funct + self.args: 
List[Any] = list(args) + self.kwargs: Dict[str, Any] = kwargs + + def __call__(self) -> Any: + return self.funct(*self.args, **self.kwargs) + + def __repr__(self) -> str: + return f"" + + def __str__(self) -> str: + return f"FunctionWrapper (funct={self.funct}, args={self.args}, kwargs={self.kwargs})" + + def __eq__(self, other: "FunctionWrapper") -> bool: + return self.funct == other.funct and self.args == other.args and self.kwargs == other.kwargs + + def __hash__(self) -> int: + return hash((self.funct, tuple(self.args), frozenset(self.kwargs.items()))) \ No newline at end of file diff --git a/src/schedule/schedule/job.py b/src/schedule/schedule/job.py new file mode 100644 index 00000000..2c7f02fb --- /dev/null +++ b/src/schedule/schedule/job.py @@ -0,0 +1,116 @@ + +import logging +import datetime +from typing import Callable + +from sqlalchemy import Column, Integer, PickleType, DateTime, ARRAY, ForeignKey, Boolean +from sqlalchemy.orm import relationship + +from schedule.utils import * +from schedule.constants import * +from schedule.config import Config +from schedule.function_wrapper import FunctionWrapper + +logger = logging.getLogger('scheduler_job') +logger.setLevel(logging.DEBUG) + +class CancelJob(object): + """ + Can be returned from a job to unschedule itself. + """ + + pass + +class Job(SQL_ALCHEMY_BASE): + __tablename__ = ENV('POSTGRES_SCHED_JOB_TABLE_NAME') + + id = Column(Integer, primary_key=True) + + cancelled = Column(Boolean, default=False) + + sched_config_id = Column(Integer, ForeignKey('sched_config.id')) + sched_config = relationship('Config', backref='jobs', lazy=True, uselist=False) + + num_repeats = Column(Integer, default=0) + + last_run = Column(DateTime) + next_run = Column(DateTime) + + job_funct = Column(PickleType, default=None, nullable=False) + + def __init__(self, sched_config: Config) -> None: + self.sched_config: Config = sched_config + self.num_repeats: int = 0 + + def __lt__(self, other: 'Job') -> bool: + assert self.next_run is not None, "must run _schedule_next_run before" + assert other.next_run is not None, "must run _schedule_next_run before" + return self.next_run < other.next_run + + def __repr__(self) -> str: + return f"" + + def __str__(self) -> str: + return f"Job (id={self.id}, sched_config_id={self.sched_config.id}, num_repeats={self.num_repeats}, last_run={self.last_run}, next_run={self.next_run})" + + def do(self, job_func: Callable, *args, **kwargs): + self.job_funct = FunctionWrapper(job_func, *args, **kwargs) + self._schedule_first_run() + + def save(self): + self.sched_config.save() + + SQL_ALCHEMY_DB_SESSION.add(self) + SQL_ALCHEMY_DB_SESSION.commit() + + @property + def should_run(self) -> bool: + print(f'Job {self} should run? 
Currentime time: {self.sched_config.now()}') + + assert self.next_run is not None, 'must run _schedule_next_run before' + return self.sched_config.now() >= self.next_run + + def run(self): + if self._is_overdue(self.sched_config.now()): + logger.debug(f'Cancelling job {self}.\n\tReason: The job is overdue.') + return CancelJob + + logger.debug('Running job %s', self) + ret = self.job_funct() + + self.num_repeats += 1 + if self._achieved_max_repeats(): + logger.debug(f'Cancelling job {self}.\n\tReason: Max repeats achieved ({self.cancel_after_max_repeats})') + return CancelJob + + self.last_run = self.sched_config.now() + self._schedule_next_run() + + # The repeat_mode is no_repeat, so we cancel the job + if self.next_run is None: + logger.debug(f'Cancelling job {self}.\n\tReason: No more runs.') + return CancelJob + + if self._is_overdue(self.next_run): + logger.debug(f'Cancelling next job {self} run.\n\tReason: The job is overdue.') + return CancelJob + + return ret + + def cancel(self): + self.cancelled = True + self.save() + + def _schedule_first_run(self) -> None: + self.next_run = self.sched_config.first_run_date() + self.save() + + def _schedule_next_run(self) -> None: + self.next_run = self.sched_config.next_run_date(self.next_run) + self.save() + + def _is_overdue(self, when: datetime.datetime) -> bool: + return self.sched_config.max_datetime is not None and when > self.sched_config.max_datetime + + def _achieved_max_repeats(self) -> bool: + return self.sched_config.max_repeats is not None and self.num_repeats >= self.sched_config.max_repeats \ No newline at end of file diff --git a/src/schedule/schedule/schedule.py b/src/schedule/schedule/schedule.py deleted file mode 100644 index 37bfc368..00000000 --- a/src/schedule/schedule/schedule.py +++ /dev/null @@ -1,165 +0,0 @@ -import datetime -import functools -import logging -from typing import Callable, List, Optional - -from schedule.constants import * -from schedule.utils import * -from schedule.scheduler_config import SchedulerConfig - -from sqlalchemy import create_engine, Column, Integer, String, DateTime, ARRAY -from sqlalchemy.orm import sessionmaker -from sqlalchemy.orm import declarative_base -from sqlalchemy.orm.session import Session - -logger = logging.getLogger('scheduler') -logger.setLevel(logging.DEBUG) - -class ScheduleError(Exception): - """Base schedule exception""" - - pass - -class ScheduleValueError(ScheduleError): - """Base schedule value error""" - - pass - -class IntervalError(ScheduleValueError): - """An improper interval was used""" - - pass - -class CancelJob(object): - """ - Can be returned from a job to unschedule itself. - """ - - pass - -class Job: - def __init__(self, scheduler: 'Scheduler', scheduler_config: SchedulerConfig) -> None: - self.scheduler: Scheduler = scheduler # scheduler to register with - self.sched_config: SchedulerConfig = scheduler_config - - self.last_run: Optional[datetime.datetime] = None - self.next_run: Optional[datetime.datetime] = None - - self.num_repeats: int = 0 - - def do(self, job_func: Callable, *args, **kwargs): - """ - Specifies the job_func that should be called every time the - job runs. - - Any additional arguments are passed on to job_func when - the job runs. 
- - :param job_func: The function to be scheduled - :return: The invoked job instance - """ - self.job_func = functools.partial(job_func, *args, **kwargs) - functools.update_wrapper(self.job_func, job_func) - - self._schedule_first_run() - - if self.scheduler is None: - raise ScheduleError( - "Unable to a add job to schedule. " - "Job is not associated with an scheduler" - ) - - self.scheduler.jobs.append(self) - return self - - @property - def should_run(self) -> bool: - """ - :return: ``True`` if the job should be run now. - """ - assert self.next_run is not None, "must run _schedule_next_run before" - return self.sched_config.now() >= self.next_run - - def run(self): - """ - Run the job and immediately reschedule it. - If the job's deadline is reached (configured using .until()), the job is not - run and CancelJob is returned immediately. If the next scheduled run exceeds - the job's deadline, CancelJob is returned after the execution. In this latter - case CancelJob takes priority over any other returned value. - - :return: The return value returned by the `job_func`, or CancelJob if the job's - deadline is reached. - - """ - if self._is_overdue(self.sched_config.now()): - logger.debug(f"Cancelling job {self}.\n\tReason: The job is overdue.") - return CancelJob - - logger.debug("Running job %s", self) - ret = self.job_func() - - self.num_repeats += 1 - if self._achieved_max_repeats(): - logger.debug(f"Cancelling job {self}.\n\tReason: Max repeats achieved ({self.cancel_after_max_repeats})") - return CancelJob - - self.last_run = self.sched_config.now() - self._schedule_next_run() - - if self._is_overdue(self.next_run): - logger.debug(f"Cancelling next job {self} run.\n\tReason: The job is overdue.") - return CancelJob - - return ret - - def _schedule_first_run(self) -> None: - self.next_run = self.sched_config.first_run_date() - - def _schedule_next_run(self) -> None: - self.next_run = self.sched_config.next_run_date(self.next_run) - - def _is_overdue(self, when: datetime.datetime) -> bool: - return self.sched_config.max_datetime is not None and when > self.sched_config.max_datetime - - def _achieved_max_repeats(self) -> bool: - return self.sched_config.max_repeats is not None and self.num_repeats >= self.sched_config.max_repeats - -class Scheduler: - def __init__(self) -> None: - self.jobs: List[Job] = list() - - def run_pending(self) -> None: - """ - Run all jobs that are scheduled to run. - - Please note that it is *intended behavior that run_pending() - does not run missed jobs*. For example, if you've registered a job - that should run every minute and you only call run_pending() - in one hour increments then your job won't be run 60 times in - between but only once. - """ - runnable_jobs = (job for job in self.jobs if job.should_run) - for job in sorted(runnable_jobs): - self._run_job(job) - - def _run_job(self, job: "Job") -> None: - ret = job.run() - if isinstance(ret, CancelJob) or ret is CancelJob: - self.cancel_job(job) - - def schedule_from_config(self, config: SchedulerConfig) -> Job: - return Job(self, config) - - def cancel_job(self, job: Job) -> None: - """ - Delete a scheduled job. 
- - :param job: The job to be unscheduled - """ - try: - logger.debug('Cancelling job "%s"', job) - self.jobs.remove(job) - - except ValueError: - logger.debug('Cancelling not-scheduled job "%s"', job) \ No newline at end of file diff --git a/src/schedule/schedule/scheduler.py b/src/schedule/schedule/scheduler.py new file mode 100644 index 00000000..f95bd323 --- /dev/null +++ b/src/schedule/schedule/scheduler.py @@ -0,0 +1,108 @@ +import time +import logging +from typing import Callable, List + +from schedule.constants import * +from schedule.utils import * +from schedule.config import Config +from schedule.job import Job, CancelJob + +logger = logging.getLogger('scheduler') +logger.setLevel(logging.DEBUG) + +class ScheduleError(Exception): + """Base schedule exception""" + + pass + +class ScheduleValueError(ScheduleError): + """Base schedule value error""" + + pass + +class IntervalError(ScheduleValueError): + """An improper interval was used""" + + pass + +class Scheduler: + def __init__(self) -> None: + self.jobs: List[Job] = list() + + create_db_tables() + self.recover_jobs() + + def run_pending(self) -> None: + """ + Run all jobs that are scheduled to run. + + Please note that it is *intended behavior that run_pending() + does not run missed jobs*. For example, if you've registered a job + that should run every minute and you only call run_pending() + in one hour increments then your job won't be run 60 times in + between but only once. + """ + runnable_jobs = (job for job in self.jobs if job.should_run) + for job in sorted(runnable_jobs): + self._run_job(job) + + def _run_job(self, job: "Job") -> None: + ret = job.run() + if isinstance(ret, CancelJob) or ret is CancelJob: + self.cancel_job(job) + + def schedule_job(self, scheduler_config: Config, job_func: Callable, *job_args, **job_kwargs) -> Job: + """ + Add a job to the schedule. + + :param job: The job to be added + """ + new_job = Job(scheduler_config) + new_job.do(job_func, *job_args, **job_kwargs) + self.jobs.append(new_job) + + return new_job + + def cancel_job(self, job: Job) -> None: + """ + Delete a scheduled job. + + :param job: The job to be unscheduled + """ + try: + logger.debug('Cancelling job "%s"', job) + job.cancel() + self.jobs.remove(job) + + except ValueError: + logger.debug('Cancelling not-scheduled job "%s"', job) + + def recover_jobs(self) -> None: + """ + Recover jobs from the database if the job is not cancelled. + """ + + logger.debug('Recovering jobs') + self.jobs = SQL_ALCHEMY_DB_SESSION.query(Job).filter(Job.cancelled == False).all() + self.run_pending() + + def cancel_all_jobs(self) -> None: + """ + Clear all scheduled jobs. + """ + logger.debug('Cancelling all jobs') + + for job in self.jobs: + job.cancel() + + self.jobs.clear() + + def run_all(self, delay_seconds: int = 0) -> None: + """ + Run all jobs regardless if they are scheduled to run or not. 
+ + :param delay_seconds: The delay in seconds between each job + """ + for job in self.jobs: + self._run_job(job) + time.sleep(delay_seconds) \ No newline at end of file diff --git a/src/schedule/schedule/utils.py b/src/schedule/schedule/utils.py index fdacedf6..cac9fe12 100644 --- a/src/schedule/schedule/utils.py +++ b/src/schedule/schedule/utils.py @@ -2,8 +2,7 @@ import datetime from typing import Optional -from constants import * - +from schedule.constants import * def get_date(year: int, month: int, day: int, hour: int = 0, minute: int = 0, second: int = 0) -> datetime: try: @@ -86,4 +85,7 @@ def apply_timezone(datetime_obj: datetime.datetime, timezone = None) -> datetime if timezone is None: return datetime_obj - return timezone.localize(datetime_obj).astimezone(timezone).replace(tzinfo=None) \ No newline at end of file + return timezone.localize(datetime_obj).astimezone(timezone).replace(tzinfo=None) + +def create_db_tables(): + SQL_ALCHEMY_BASE.metadata.create_all(bind=SQL_ALCHEMY_ENGINE) \ No newline at end of file diff --git a/src/schedule/setup.py b/src/schedule/setup.py index 43aa9806..4b549887 100644 --- a/src/schedule/setup.py +++ b/src/schedule/setup.py @@ -17,5 +17,5 @@ # In production we may want to use the psycopg2 package itself, I'm using # the psycopg2-binary package here to avoid problems with external # libraries - install_requires=[] + install_requires=['SQLAlchemy==2.0.7', 'pytz==2022.1'] ) From 9fdb886f9e14c164757a42a21c7584b6d555743c Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Wed, 19 Apr 2023 15:49:40 -0300 Subject: [PATCH 47/89] Update scheduler container for the new schedule lib --- scheduler/src/scheduler.py | 44 +++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index 8fa5a8ee..8d073e14 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -3,6 +3,7 @@ import threading import ujson from schedule import Scheduler as Schedule +from schedule import Config as ScheduleConfig import requests from kafka import KafkaConsumer @@ -16,11 +17,6 @@ def run_crawler(crawler_id, action): SERVER_SESSION.get(settings.RUN_CRAWLER_URL + "/api/crawlers/{}/run?action={}".format(crawler_id, action)) print(f'[{datetime.now()}] [TC] Crawler {crawler_id} processed by schedule...') -def run_crawler_once(crawler_id, action): - SERVER_SESSION.get(settings.RUN_CRAWLER_URL + "/api/crawlers/{}/run?action={}".format(crawler_id, action)) - print(f'[{datetime.now()}] [TC] Crawler {crawler_id} processed by schedule...') - return scheduler.CancelJob - class Scheduler: def __init__(self, jobs): self.jobs = jobs @@ -47,31 +43,39 @@ def __run_task_consumer(self): for message in consumer: # try: - task_data = ujson.loads(message.value.decode('utf-8')) + data = ujson.loads(message.value.decode('utf-8')) print(f'[{datetime.now()}] [TC] {worker_name} Worker: Processing task data') - self.__process_task_data(task_data) + self.__process_task_data(data) # except Exception as e: # print(f'[{datetime.now()}] [TC] {worker_name} Worker: Error processing task data: "{e}"') - def _set_schedule_call_for_task(self, task_data): - params = [run_crawler, task_data["data"]["crawl_request"], task_data["data"]["crawler_queue_behavior"]] - job = self.scheduler.schedule_from_config(task_data["data"]).do(*params) - self.jobs[task_data["data"]["id"]] = job + def _set_schedule_call_for_task(self, config_dict, task_id, crawler_id, behavior): + config = ScheduleConfig() + 
config.load_config(config_dict) - def __process_task_data(self, task_data): + job = self.scheduler.schedule_job(config, run_crawler, crawler_id=crawler_id, action=behavior) + self.jobs[task_id] = job + + def __process_task_data(self, data): + action = data['action'] + config_dict = data['schedule_config'] + + task_id = data['task_data']['id'] + crawler_id = data['task_data']['crawl_request'] + behavior = data['task_data']['crawler_queue_behavior'] - if task_data["action"] == "cancel": - schedule.cancel_job(self.jobs[task_data["data"]["id"]]) + if action == "cancel": + self.scheduler.cancel_job(self.jobs[task_id]) - if task_data["action"] == "update": - schedule.cancel_job(self.jobs[task_data["data"]["id"]]) - self._set_schedule_call_for_task(task_data) + if action == "update": + self.scheduler.cancel_job(self.jobs[task_id]) + self._set_schedule_call_for_task(config_dict, task_id, crawler_id, behavior) - if task_data["action"] == "create": - self._set_schedule_call_for_task(task_data) + if action == "create": + self._set_schedule_call_for_task(config_dict, task_id, crawler_id, behavior) def __create_task_consumer(self): self.thread = threading.Thread(target=self.__run_task_consumer, daemon=True) @@ -80,8 +84,8 @@ def __create_task_consumer(self): def run(self): self.__create_task_consumer() while True: + self.scheduler.run_pending() sleep(1) - schedule.run_pending() if __name__ == '__main__': From ab2081d0584af5678c4a8aa4206b542b8b766efb Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Wed, 19 Apr 2023 15:57:05 -0300 Subject: [PATCH 48/89] Minor refactoring --- scheduler/src/scheduler.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index 8d073e14..522ad399 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -13,6 +13,10 @@ SERVER_SESSION = requests.sessions.Session() +CANCEL_TASK = "cancel" +UPDATE_TASK = "update" +CREATE_TASK = "create" + def run_crawler(crawler_id, action): SERVER_SESSION.get(settings.RUN_CRAWLER_URL + "/api/crawlers/{}/run?action={}".format(crawler_id, action)) print(f'[{datetime.now()}] [TC] Crawler {crawler_id} processed by schedule...') @@ -67,14 +71,14 @@ def __process_task_data(self, data): crawler_id = data['task_data']['crawl_request'] behavior = data['task_data']['crawler_queue_behavior'] - if action == "cancel": + if action == CANCEL_TASK: self.scheduler.cancel_job(self.jobs[task_id]) - if action == "update": + if action == UPDATE_TASK: self.scheduler.cancel_job(self.jobs[task_id]) self._set_schedule_call_for_task(config_dict, task_id, crawler_id, behavior) - if action == "create": + if action == CREATE_TASK: self._set_schedule_call_for_task(config_dict, task_id, crawler_id, behavior) def __create_task_consumer(self): @@ -85,13 +89,4 @@ def run(self): self.__create_task_consumer() while True: self.scheduler.run_pending() - sleep(1) - - -if __name__ == '__main__': - jobs = {} - - # get initial jobs - - scheduler = Scheduler(jobs) - scheduler.run() + sleep(1) \ No newline at end of file From 3d10fd4f34f922dc57e75048c147e3f1e834e1e2 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Wed, 19 Apr 2023 16:07:57 -0300 Subject: [PATCH 49/89] Minor refactoring --- src/schedule/schedule/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/schedule/schedule/config.py b/src/schedule/schedule/config.py index f45e6369..ae0f397d 100644 --- a/src/schedule/schedule/config.py +++ b/src/schedule/schedule/config.py @@ 
-98,10 +98,10 @@ class Config(SQL_ALCHEMY_BASE): monthly_first_weekday: Optional[int] = Column(Integer, default=None) monthly_last_weekday: Optional[int] = Column(Integer, default=None) - def __init__(self): + def __init__(self, config_dict: SchedulerConfigDict = None): super().__init__() - self.repeat_interval = 1 + self._load_config(config_dict) def save(self): SQL_ALCHEMY_DB_SESSION.add(self) @@ -211,7 +211,7 @@ def next_run_date(self, last_run_date: datetime.datetime) -> datetime.datetime: else: raise SchedulerConfigError('Invalid repeat mode') - def load_config(self, config_dict: SchedulerConfigDict) -> None: + def _load_config(self, config_dict: SchedulerConfigDict) -> None: # We assume that the config_dict is valid. That is, it has been validated before # SchedulerConfig.valid_config(config_dict) From ee46139a35f5fa59549edce3909e3aa2ae50ff8e Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Fri, 28 Apr 2023 09:09:09 -0300 Subject: [PATCH 50/89] New tests and other improviments to scheduler lib --- scheduler/src/scheduler.py | 4 +- src/schedule/README.md | 78 ++- src/schedule/schedule/__init__.py | 11 - src/schedule/schedule/config.py | 608 ++++++++++++------ src/schedule/schedule/config_dict.py | 91 +++ src/schedule/schedule/constants.py | 44 +- .../schedule/{utils.py => date_utils.py} | 125 +++- src/schedule/schedule/function_wrapper.py | 1 - src/schedule/schedule/job.py | 125 +++- src/schedule/schedule/schedule.py | 157 +++++ src/schedule/schedule/scheduler.py | 108 ---- src/schedule/setup.py | 12 +- src/schedule/tests/test_config.py | 219 +++++++ src/schedule/tests/test_date_utils.py | 2 +- src/schedule/tests/test_job.py | 141 +++- 15 files changed, 1341 insertions(+), 385 deletions(-) create mode 100644 src/schedule/schedule/config_dict.py rename src/schedule/schedule/{utils.py => date_utils.py} (54%) create mode 100644 src/schedule/schedule/schedule.py delete mode 100644 src/schedule/schedule/scheduler.py create mode 100644 src/schedule/tests/test_config.py diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index 522ad399..324af732 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -57,9 +57,7 @@ def __run_task_consumer(self): # print(f'[{datetime.now()}] [TC] {worker_name} Worker: Error processing task data: "{e}"') def _set_schedule_call_for_task(self, config_dict, task_id, crawler_id, behavior): - config = ScheduleConfig() - config.load_config(config_dict) - + config = ScheduleConfig(config_dict) job = self.scheduler.schedule_job(config, run_crawler, crawler_id=crawler_id, action=behavior) self.jobs[task_id] = job diff --git a/src/schedule/README.md b/src/schedule/README.md index ecc27b2d..eb380836 100644 --- a/src/schedule/README.md +++ b/src/schedule/README.md @@ -1 +1,77 @@ -# Scheduler +# scheduler + +This is a scheduler based on the project [schedule](https://github.com/dbader/schedule) with significant changes, including: + +- More flexible in scheduling, based in json; +- Capable of recover when the program is restarted or the system is rebooted; +- More robust in error handling. + +## Installation + +```bash + +``` + +## Configuration + + +> You only need to configure PostgreSQL if you want to recover the tasks when the system/program starts. + +**You can skip this step if will not intend use recovery resource.** + +In order to retrieve the tasks scheduled when the program/system is off, the scheduler needs to connect to a PostgreSQL database. 
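When persistence is enabled, the scheduler keeps its state in two PostgreSQL tables whose names are read through `django-environ` in `schedule/constants.py` (`POSTGRES_SCHED_CONFIG_TABLE_NAME` and `POSTGRES_SCHED_JOB_TABLE_NAME`, defaulting to `sched_config` and `sched_job`). A minimal sketch of overriding them, assuming the variables are simply exported in the environment that runs the scheduler (the table names `my_sched_config` and `my_sched_job` below are only illustrative):

```bash
# Optional: override the default table names used by the schedule library
export POSTGRES_SCHED_CONFIG_TABLE_NAME=my_sched_config
export POSTGRES_SCHED_JOB_TABLE_NAME=my_sched_job
```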
+ +> We cannot use SQLite because it does not support some data types that we need. + +If you do not have a PostgreSQL database, you can install it by following the instructions [here](https://www.postgresql.org/download/). + +Or you can use the docker image. If you have docker installed, you can pull the image by running: + +```bash + +docker pull postgres + +``` + +Then you can run the image by running: + +```bash +docker run --name postgres-sched -e POSTGRES_USER=sched_user -e POSTGRES_PASSWORD=sched_password -e POSTGRES_DB=sched_db -p 5432:5432 -d postgres +``` + +This will create a container named `postgres-sched` with a database named `sched_db` and a user named `sched_user` with password `sched_password`. The container will be listening on port `5432` on your host machine. + +## Usage + +> If you want to recover the tasks, you need to configure the database connection (see Configuration section). + +Is needed to create an instance of Scheduler, with the **optional** parameter `persist_tasks`. Set to `True` if you want to recover the tasks when the program/system is restarted. + +Moreover, if `persist_tasks` is set to `True`, is needed set the following parameters according with your database configuration: + +- db_host: The host of the database. Default: localhost; +- db_port: The port of the database. Default: 5432; +- db_name: The name of the database. Default: sched_db; +- db_user: The user of the database. Default: sched_user; +- db_password: The password of the database. Default: sched_password; + +### Setting a schedule + +The configuration of a schedule must be defined in a json, with the following attributes: +- `start_date` [required]: The date when the schedule will start. Valid formats are: + - `%Y-%m-%d %H:%M:%S`, + - `%d-%m-%Y %H:%M:%S`, + - `%Y-%m-%d %H:%M`, + - `%d-%m-%Y %H:%M`, + - `%Y-%m-%d`, + - `%d-%m-%Y`, +- `timezone` [required]: The timezone of the schedule. Valid timezones are all timezones supported by [pytz](https://pypi.org/project/pytz/); +- `repeat_mode`: The repeat mode of the schedule. 
Valid modes are: + - `no_repeat`: The schedule will run only once; + - `minutely`: The schedule will run every minute; + - `hourly`: The schedule will run every hour; + - `daily`: The schedule will run every day at the time defined in `start_date`; + - `weekly`: The schedule will run every week at the time defined in `start_date`; + - `monthly`: The schedule will run every month at the time defined in `start_date`; + - `yearly`: The schedule will run every year at the time defined in `start_date`; + diff --git a/src/schedule/schedule/__init__.py b/src/schedule/schedule/__init__.py index 269b71d7..e69de29b 100644 --- a/src/schedule/schedule/__init__.py +++ b/src/schedule/schedule/__init__.py @@ -1,11 +0,0 @@ -from schedule.constants import * -from schedule.utils import * -from schedule.scheduler import Job, Scheduler -from schedule.config import (Config, - SchedulerConfigDict, - PersonalizedRepeat, - MonthlyRepeatConf, - Finish) -from schedule.scheduler import Job, Scheduler - - diff --git a/src/schedule/schedule/config.py b/src/schedule/schedule/config.py index ae0f397d..da304ff2 100644 --- a/src/schedule/schedule/config.py +++ b/src/schedule/schedule/config.py @@ -1,72 +1,38 @@ import datetime -from typing import List, Optional, Union +from typing import List, Optional import pytz -from typing_extensions import Literal, TypedDict -from schedule.constants import * -from schedule.utils import * +from schedule.constants import (ENV, SQL_ALCHEMY_BASE, NO_REPEAT_MODE, + DAILY_REPEAT_MODE, WEEKLY_REPEAT_MODE, + MONTHLY_REPEAT_MODE, YEARLY_REPEAT_MODE, + MONTHLY_DAY_X_OCCURRENCE_TYPE, + MONTHLY_FIRST_WEEKDAY_OCCURRENCE_TYPE, + MONTHLY_LAST_WEEKDAY_OCCURRENCE_TYPE, + PERSONALIZED_REPEAT_MODE, + REPEAT_FINISH_BY_OCCURRENCES, REPEAT_FINISH_BY_DATE, + REQUIRED_FIELDS, VALID_DATETIME_FORMATS, + VALID_REPEAT_MODES, PERSONALIZED_REQUIRED_FIELDS, + VALID_PERSONALIZED_REPEAT_MODES, VALID_MONTHLY_REPEAT_MODES, + VALID_REPEAT_FINISH, HOURLY_REPEAT_MODE,MINUTELY_REPEAT_MODE, + VALID_BEHAVIOR_AFTER_SYSTEM_RESTART, + DEFAULT_BEHAVIOR_AFTER_SYSTEM_RESTART) + +from schedule import date_utils +from schedule.config_dict import ConfigDict, PersonalizedRepeat from sqlalchemy import Column, Integer, String, DateTime, ARRAY -class Finish(TypedDict): - '''Define qual parâmetro para parar de reagendar uma coleta, a saber: - - never: o coletor é reagendado para sempre. - - occurrence: o coletor é colocado para executar novamente vezes. - - date: O coletor é colocado para executar até a data - ''' - mode: Literal['never', 'occurrence', 'date'] - value: Union[None, int, str] - -class MonthlyRepeatConf(TypedDict): - ''' Caso a repetição personalizado seja por mês, o usuário pode escolher 3 tipos de agendamento mensal: - - first-weekday: A coleta ocorre no primeiro dia (domingo, segunda, etc) da semana do mês, contado a partir de 0 - domingo. - - last-weekday: A coleta ocorre no último dia (domingo, segunda, etc) da semana do mês, contado a partir de 0 - domingo. - - day-x: A coleta ocorre no dia x do mês. Se o mês não tiver o dia x, ocorrerá no último dia do mês. - ''' - mode: Literal['first-weekday', 'last-weekday', 'day-x'] - - # Se [first,last]-weekday, indica qual dia semana a coleta deverá ocorrer, contado a partir de 0 - domingo. - # Se day-x, o dia do mês que a coleta deverá ocorrer. - value: int - -class PersonalizedRepeat(TypedDict): - # Uma repetição personalizada pode ser por dia, semana, mês ou ano. 
- mode: Literal['daily', 'weekly', 'monthly', 'yearly'] - - # de quanto em quanto intervalo de tempo a coleta irá ocorrer - interval: int - - ''' Dados extras que dependem do tipo da repetição. A saber, se é: - - daily: additional_data receberá null - - weekly: additional_data será uma lista com dias da semana (iniciados em 0 - domingo) - para quais dias semana a coleta irá executar. - - monthly: Ver classe MonthlyRepetitionConf. - - yearly: additional_data receberá null - ''' - data: Union[None, List, MonthlyRepeatConf] - - # Define até quando o coletor deve ser reagendado. Ver classe Finish. - finish: Finish - -class SchedulerConfigDict(TypedDict): - start_date: str - timezone: str - - repeat_mode: Literal['no_repeat', 'daily', 'weekly', 'monthly', 'yearly', 'personalized'] - - personalized_repeat: Union[None, PersonalizedRepeat] - -class SchedulerConfigError(Exception): +class ConfigError(Exception): pass -class SchedulerConfigMissingFieldError(SchedulerConfigError): +class ConfigMissingFieldError(ConfigError): pass -class SchedulerConfigValueError(SchedulerConfigError): +class ConfigValueError(ConfigError): pass -class SchedulerConfigInvalidRepeatModeError(SchedulerConfigError): +class ConfigInvalidRepeatModeError(ConfigError): pass class Config(SQL_ALCHEMY_BASE): @@ -77,6 +43,9 @@ class Config(SQL_ALCHEMY_BASE): start_date: datetime.datetime = Column(DateTime, nullable=False) timezone: str = Column(String, nullable=True) + # 0: cancel task, 1: re-schedule task for next valid run, 2: execute task now + behavior_after_system_restart: int = Column(Integer, default=0) + repeat_mode: str = Column(String, default=NO_REPEAT_MODE) repeat_interval: int = Column(Integer, default=1) @@ -98,16 +67,42 @@ class Config(SQL_ALCHEMY_BASE): monthly_first_weekday: Optional[int] = Column(Integer, default=None) monthly_last_weekday: Optional[int] = Column(Integer, default=None) - def __init__(self, config_dict: SchedulerConfigDict = None): + def __init__(self, db_session=None): super().__init__() + + self.db_session = db_session self.repeat_interval = 1 - self._load_config(config_dict) + def __eq__(self, other): + return self.start_date == other.start_date and \ + self.timezone == other.timezone and \ + self.behavior_after_system_restart == other.behavior_after_system_restart and \ + self.repeat_mode == other.repeat_mode and \ + self.repeat_interval == other.repeat_interval and \ + self.max_repeats == other.max_repeats and \ + self.max_datetime == other.max_datetime and \ + self.weekdays_to_run == other.weekdays_to_run and \ + self.monthly_repeat_mode == other.monthly_repeat_mode and \ + self.monthly_day_x_ocurrence == other.monthly_day_x_ocurrence and \ + self.monthly_first_weekday == other.monthly_first_weekday and \ + self.monthly_last_weekday == other.monthly_last_weekday + def save(self): - SQL_ALCHEMY_DB_SESSION.add(self) - SQL_ALCHEMY_DB_SESSION.commit() + ''' + Saves the config to the database. + ''' + if self.db_session is None: + return + + self.db_session.add(self) + self.db_session.commit() def first_run_date(self) -> datetime.datetime: + ''' + Calculates the first run date based on the config. + + Returns: The first run date. 
+ ''' start_date = self.start_date repeat_interval = self.repeat_interval @@ -119,6 +114,14 @@ def first_run_date(self) -> datetime.datetime: if self.repeat_mode == NO_REPEAT_MODE: return start_date + elif self.repeat_mode == MINUTELY_REPEAT_MODE: + # Must consider the hour of start date + raise NotImplementedError() + + if self.repeat_mode == HOURLY_REPEAT_MODE: + # Must consider the hour of start date + raise NotImplementedError() + elif self.repeat_mode == DAILY_REPEAT_MODE: return start_date if now < start_date else start_date + datetime.timedelta(days=repeat_interval) @@ -128,105 +131,137 @@ def first_run_date(self) -> datetime.datetime: if start_date_weekday in self.weekdays_to_run: return start_date - return weeks_next_execution_date(start_date, self.weekdays_to_run, repeat_interval) + return date_utils.weeks_next_execution_date(start_date, self.weekdays_to_run, repeat_interval) elif self.repeat_mode == MONTHLY_REPEAT_MODE: if self.monthly_repeat_mode == MONTHLY_DAY_X_OCCURRENCE_TYPE: if self.start_date.day <= self.monthly_day_x_ocurrence: return start_date.replace(day=self.monthly_day_x_ocurrence) - return month_next_execution_date(start_date, + return date_utils.month_next_execution_date(start_date, MONTHLY_DAY_X_OCCURRENCE_TYPE, day_x = self.monthly_day_x_ocurrence, interval=repeat_interval) elif self.monthly_repeat_mode == MONTHLY_FIRST_WEEKDAY_OCCURRENCE_TYPE: - first_weekday_start_date = get_first_weekday_date_of_month(self.monthly_first_weekday, + first_weekday_start_date = date_utils.get_first_weekday_date_of_month(self.monthly_first_weekday, start_date.year, start_date.month, start_date.hour, start_date.minute, start_date.second) - return first_weekday_start_date if first_weekday_start_date >= start_date else month_next_execution_date(start_date, + return first_weekday_start_date if first_weekday_start_date >= start_date else date_utils.month_next_execution_date(start_date, MONTHLY_FIRST_WEEKDAY_OCCURRENCE_TYPE, first_weekday_to_run=self.monthly_first_weekday, interval=repeat_interval) elif self.monthly_repeat_mode == MONTHLY_LAST_WEEKDAY_OCCURRENCE_TYPE: - last_weekday_start_date = get_last_weekday_date_of_month(self.monthly_last_weekday, + last_weekday_start_date = date_utils.get_last_weekday_date_of_month(self.monthly_last_weekday, start_date.year, start_date.month, start_date.hour, start_date.minute, start_date.second) - return last_weekday_start_date if last_weekday_start_date >= start_date else month_next_execution_date(start_date, + return last_weekday_start_date if last_weekday_start_date >= start_date else date_utils.month_next_execution_date(start_date, MONTHLY_LAST_WEEKDAY_OCCURRENCE_TYPE, last_weekday_to_run=self.monthly_last_weekday, interval=repeat_interval) else: - raise SchedulerConfigError('Invalid monthly repeat mode') + raise ConfigError('Invalid monthly repeat mode') elif self.repeat_mode == YEARLY_REPEAT_MODE: - return start_date if now <= start_date else year_next_execution_date(start_date, repeat_interval) + return start_date if now <= start_date else date_utils.year_next_execution_date(start_date, repeat_interval) else: - raise SchedulerConfigError('Invalid repeat mode') + raise ConfigError('Invalid repeat mode') def next_run_date(self, last_run_date: datetime.datetime) -> datetime.datetime: + ''' + Calculates the next run date based on the config. + + :param last_run_date: The last run date. + + :returns: The next run date. 
+ ''' + if self.repeat_mode == NO_REPEAT_MODE: return None + elif self.repeat_mode == MINUTELY_REPEAT_MODE: + return last_run_date + datetime.timedelta(minutes=self.repeat_interval) + + if self.repeat_mode == HOURLY_REPEAT_MODE: + return last_run_date + datetime.timedelta(hours=self.repeat_interval) + elif self.repeat_mode == DAILY_REPEAT_MODE: return last_run_date + datetime.timedelta(days=self.repeat_interval) elif self.repeat_mode == WEEKLY_REPEAT_MODE: - return weeks_next_execution_date(last_run_date, self.weekdays_to_run, self.repeat_interval) + return date_utils.weeks_next_execution_date(last_run_date, self.weekdays_to_run, self.repeat_interval) elif self.repeat_mode == MONTHLY_REPEAT_MODE: if self.monthly_repeat_mode == MONTHLY_DAY_X_OCCURRENCE_TYPE: - return month_next_execution_date(last_run_date, + return date_utils.month_next_execution_date(last_run_date, MONTHLY_DAY_X_OCCURRENCE_TYPE, day_x = self.monthly_day_x_ocurrence, interval=self.repeat_interval) elif self.monthly_repeat_mode == MONTHLY_FIRST_WEEKDAY_OCCURRENCE_TYPE: - return month_next_execution_date(last_run_date, + return date_utils.month_next_execution_date(last_run_date, MONTHLY_FIRST_WEEKDAY_OCCURRENCE_TYPE, first_weekday_to_run=self.monthly_first_weekday, interval=self.repeat_interval) elif self.monthly_repeat_mode == MONTHLY_LAST_WEEKDAY_OCCURRENCE_TYPE: - return month_next_execution_date(last_run_date, + return date_utils.month_next_execution_date(last_run_date, MONTHLY_LAST_WEEKDAY_OCCURRENCE_TYPE, last_weekday_to_run=self.monthly_last_weekday, interval=self.repeat_interval) else: - raise SchedulerConfigError('Invalid monthly repeat mode') + raise ConfigError('Invalid monthly repeat mode') elif self.repeat_mode == YEARLY_REPEAT_MODE: - return year_next_execution_date(last_run_date, self.repeat_interval) + return date_utils.year_next_execution_date(last_run_date, self.repeat_interval) else: - raise SchedulerConfigError('Invalid repeat mode') + raise ConfigError('Invalid repeat mode') - def _load_config(self, config_dict: SchedulerConfigDict) -> None: - # We assume that the config_dict is valid. That is, it has been validated before - # SchedulerConfig.valid_config(config_dict) + def load_config(self, config_dict: ConfigDict) -> None: + ''' + Loads the config from a dictionary. + + :param config_dict: The config dictionary. + ''' + + Config.valid_config(config_dict) self.timezone = config_dict['timezone'] - self.start_date = decode_datetimestr(config_dict['start_date']) + self.start_date = date_utils.decode_datetimestr(config_dict['start_date']) self.repeat_mode = config_dict['repeat_mode'] + self.behavior_after_system_restart = config_dict.get('behavior_after_system_restart', DEFAULT_BEHAVIOR_AFTER_SYSTEM_RESTART) if config_dict['repeat_mode'] == PERSONALIZED_REPEAT_MODE: self._parse_personalized_config(config_dict['personalized_repeat']) def now(self) -> datetime.datetime: + ''' + Returns the current datetime. + + :returns: The current datetime. + ''' + timezone = pytz.timezone(self.timezone) return datetime.datetime.now(timezone).replace(tzinfo=None) def _parse_personalized_config(self, config_dict: PersonalizedRepeat) -> None: + ''' + Parses the personalized repeat config. + + :param config_dict: The config dictionary. 
+ ''' + self.repeat_mode = config_dict['mode'] self.repeat_interval = config_dict['interval'] @@ -246,7 +281,7 @@ def _parse_personalized_config(self, config_dict: PersonalizedRepeat) -> None: self.monthly_last_weekday = config_dict['data']['value'] else: - raise SchedulerConfigInvalidRepeatModeError(f'The mode "{self.monthly_repeat_mode}" is invalid for monthly repeat mode!') + raise ConfigInvalidRepeatModeError(f'The mode "{self.monthly_repeat_mode}" is invalid for monthly repeat mode!') if 'finish' in config_dict and config_dict['finish'] is not None: finish_repeat_mode = config_dict['finish']['mode'] @@ -255,155 +290,332 @@ def _parse_personalized_config(self, config_dict: PersonalizedRepeat) -> None: self.max_repeats = config_dict['finish']['value'] elif finish_repeat_mode == REPEAT_FINISH_BY_DATE: - self.max_datetime = decode_datetimestr(config_dict['finish']['value']) + self.max_datetime = date_utils.decode_datetimestr(config_dict['finish']['value']) @staticmethod - def valid_config(config_dict: SchedulerConfigDict) -> None: + def _valid_required_fields(config_dict: ConfigDict): + ''' + Validates if the required fields are present in the config. + + Raises an exception if a required field is missing. + + :param config_dict: The config dictionary. + + :raises ConfigMissingFieldError: If a required field is missing. + ''' + config_fields = config_dict.keys() for req_field in REQUIRED_FIELDS: if req_field not in config_fields: - raise SchedulerConfigMissingFieldError(f'The field "{req_field}" if the config of schedule is missing!') + raise ConfigMissingFieldError(f'The field "{req_field}" if the config of schedule is missing!') + + @staticmethod + def _valid_start_date_and_timezone(config_dict: ConfigDict): + ''' + Validates if the start date and timezone are valid. + + Raises an exception if the start date is invalid or if the timezone is not valid. + + :param config_dict: The config dictionary. + + :raises ConfigValueError: If the start date is invalid. + :raises ConfigValueError: If the timezone is invalid. + ''' if config_dict['start_date'] is None: valid_formats = '\n\t- '.join(VALID_DATETIME_FORMATS) - raise SchedulerConfigValueError(f'The field `start_date` must be in one of the following formats: \n\t- {valid_formats}') - - start_date = decode_datetimestr(config_dict['start_date']) + raise ConfigValueError(f'The field `start_date` must be in one of the following formats: \n\t- {valid_formats}') + start_date = date_utils.decode_datetimestr(config_dict['start_date']) + if start_date is None: valid_formats = '\n\t- '.join(VALID_DATETIME_FORMATS) - raise SchedulerConfigValueError(f'The field `start_date` must be in one of the following formats: \n\t- {valid_formats}') + raise ConfigValueError(f'The field `start_date` must be in one of the following formats: \n\t- {valid_formats}') if config_dict['timezone'] not in pytz.all_timezones: - raise SchedulerConfigValueError(f'The timezone "{config_dict["timezone"]}" is not valid!') + raise ConfigValueError(f'The timezone "{config_dict["timezone"]}" is not valid!') timezone = pytz.timezone(config_dict['timezone']) now = datetime.datetime.now(timezone).replace(tzinfo=None) if start_date < now: - raise SchedulerConfigValueError('The start date for scheduling has passed.' \ + raise ConfigValueError('The start date for scheduling has passed.' 
\ f' Now is {now} and start date has been set to {start_date}!') + @staticmethod + def _valid_behavior_after_system_restart(config_dict: ConfigDict): + ''' + Validates if the behavior after system restart is valid. + + Raises an exception if the behavior after system restart is invalid. + + :param config_dict: The config dictionary. + + :raises ConfigValueError: If the behavior after system restart is invalid. + ''' + + behavior_after_system_restart = config_dict.get('behavior_after_system_restart') + if behavior_after_system_restart is not None and behavior_after_system_restart not in VALID_BEHAVIOR_AFTER_SYSTEM_RESTART: + valid_formats = '\n\t- '.join(VALID_BEHAVIOR_AFTER_SYSTEM_RESTART) + raise ConfigValueError(f'The field `behavior_after_system_restart` must be in one of the following values: \n\t- {valid_formats}') + + @staticmethod + def _valid_repeat_mode(config_dict: ConfigDict) -> bool: + ''' + Validates if the repeat mode is valid. + + Raises an exception if the repeat mode is invalid. + + :param config_dict: The config dictionary. + + :raises ConfigInvalidRepeatModeError: If the repeat mode is invalid. + ''' + repeat_mode = config_dict['repeat_mode'] if repeat_mode not in VALID_REPEAT_MODES: valid_repeat_modes = ', '.join(VALID_REPEAT_MODES) - raise SchedulerConfigInvalidRepeatModeError(f'The valid repeats modes are: {valid_repeat_modes}. `{repeat_mode}` is not included!') + raise ConfigInvalidRepeatModeError(f'The valid repeats modes are: {valid_repeat_modes}. `{repeat_mode}` is not included!') - if repeat_mode == PERSONALIZED_REPEAT_MODE: - if type(config_dict['personalized_repeat']) is not dict: - personalized_required_fields = ', '.join(PERSONALIZED_REQUIRED_FIELDS) - raise SchedulerConfigValueError('If repeat mode is personalized, the field `personalized_repeat`' \ - f' must be a dict with the following fields: {personalized_required_fields}.') + @staticmethod + def _valid_personalized_required_fields(config_dict: ConfigDict): + ''' + Validates if the required fields for personalized repeat mode are present in the config. - personalized_available_fields = config_dict['personalized_repeat'].keys() + Raises an exception if a required field is missing. - for req_field in PERSONALIZED_REQUIRED_FIELDS: - if req_field not in personalized_available_fields: - raise SchedulerConfigMissingFieldError(f'The field `{req_field}` of `personalized_repeat` is missing!') + :param config_dict: The config dictionary. - personalized_repeat = config_dict['personalized_repeat']['mode'] + :raises ConfigMissingFieldError: If a required field is missing. + :raises ConfigValueError: If the repeat mode is invalid. + ''' + + if type(config_dict['personalized_repeat']) is not dict: + personalized_required_fields = ', '.join(PERSONALIZED_REQUIRED_FIELDS) + raise ConfigValueError('If repeat mode is personalized, the field `personalized_repeat`' \ + f' must be a dict with the following fields: {personalized_required_fields}.') - if personalized_repeat not in VALID_PERSONALIZED_REPEAT_MODES: - valid_repeat_modes = ', '.join(VALID_PERSONALIZED_REPEAT_MODES) - raise SchedulerConfigInvalidRepeatModeError(f'The valid repeats modes for `personalized_repeat` are: {valid_repeat_modes}. 
`{repeat_mode}` is not included!') + personalized_available_fields = config_dict['personalized_repeat'].keys() - personalized_interval = config_dict['personalized_repeat']['interval'] - if type(personalized_interval) is not int: - raise SchedulerConfigValueError(f'The repeat interval for `personalized_repeat` must be `int`, not `{type(personalized_interval)}`!') - - if personalized_interval <= 0: - raise SchedulerConfigValueError(f'The repeat interval for `personalized_repeat` must be a integer greater than 0!') - - personalized_data = config_dict['personalized_repeat']['data'] - if type(personalized_data) not in (type(None), list, dict): - raise SchedulerConfigValueError(f'The field `data` of `personalized_repeat` must be: None, a list or a dict.') - - personalized_repeat_mode = config_dict['personalized_repeat']['mode'] + for req_field in PERSONALIZED_REQUIRED_FIELDS: + if req_field not in personalized_available_fields: + raise ConfigMissingFieldError(f'The field `{req_field}` of `personalized_repeat` is missing!') - if personalized_repeat_mode == WEEKLY_REPEAT_MODE: - if type(personalized_data) is not list: - raise SchedulerConfigValueError(f'If the personalized repeat mode is {WEEKLY_REPEAT_MODE}, the field' \ - ' `data` of `personalized_repeat` must be a list of integers.') + @staticmethod + def _valid_personalized_modes(config_dict: ConfigDict): + ''' + Validates if the personalized repeat mode is valid. - types_in_list = {type(val) for val in personalized_data} + Raises an exception if the personalized repeat mode is invalid. - if len(types_in_list) != 1 and int not in types_in_list: - raise SchedulerConfigValueError('The list of days for run in `personalized_repeat` must be integers from 0 (sunday) to 6 (saturday).') + :param config_dict: The config dictionary. - if min(personalized_data) < 0 or max(personalized_data) > 6: - raise SchedulerConfigValueError('The list of days for run in `personalized_repeat` must be integers from 0 (sunday) to 6 (saturday).') - - if personalized_repeat_mode == MONTHLY_REPEAT_MODE: - if type(personalized_data) is not dict: - raise SchedulerConfigValueError(f'If the personalized repeat mode is {MONTHLY_REPEAT_MODE}, the field' \ - ' `data` of `personalized_repeat` must be a dict with fields `mode` and `value`.') - - fields_available = personalized_data.keys() - for req_field in ('mode', 'value'): - if req_field not in fields_available: - raise SchedulerConfigMissingFieldError(f'If the personalized repeat mode is {MONTHLY_REPEAT_MODE}, the field' \ - ' `data` of `personalized_repeat` must be a dict with fields `mode` and `value`.' \ - f' `req_field` is missing!' 
) - - personalized_repetion_monthly_mode = personalized_data['mode'] - - if personalized_repetion_monthly_mode not in VALID_MONTHLY_REPEAT_MODES: - valid_monthly_modes = ', '.join(VALID_MONTHLY_REPEAT_MODES) - raise SchedulerConfigValueError(f'The monthly personalized repeat mode must be: {valid_monthly_modes}.') - - personalized_repetion_monthly_value = personalized_data['value'] - if type(personalized_repetion_monthly_value) is not int: - raise SchedulerConfigValueError('The field `value` of `data` in `personalized_repeat` must be a integer, for monthly personalized repeat!') - - if personalized_repetion_monthly_mode == MONTHLY_DAY_X_OCCURRENCE_TYPE: - if personalized_repetion_monthly_value < 1 or personalized_repetion_monthly_value > 31: - raise SchedulerConfigValueError('The field `value` of `data` in `personalized_repeat` must be a integer' \ - ' between 1 and 31, for monthly personalized repeat `day-x`!') - - else: - if personalized_repetion_monthly_value < 0 or personalized_repetion_monthly_value > 6: - raise SchedulerConfigValueError('The field `value` of `data` in `personalized_repeat` must be a integer' \ - f' between 0 and 6, for monthly personalized repeat `{personalized_repetion_monthly_mode}`!') - - - finish_repeat = config_dict['personalized_repeat']['finish'] - if type(finish_repeat) not in (type(None), dict): - raise SchedulerConfigValueError('The field `finish` of `personalized_repeat` must be None or dict!') - - if type(finish_repeat) is dict: - fields_available = finish_repeat.keys() - - for req_field in ('mode', 'value'): - if req_field not in fields_available: - raise SchedulerConfigMissingFieldError('If the field `finish` of `personalized_repeat_mode` is not of '\ - f'type NoneType, it must be a dict with fields `mode` and `value`. The field `{req_field}` is missing!') - - finish_mode = finish_repeat['mode'] - - if finish_mode not in VALID_REPEAT_FINISH: - valid_finish_modes = ', '.join(VALID_REPEAT_FINISH) - raise SchedulerConfigInvalidRepeatModeError(f'The valid finish modes for `personalized_repeat` are: {valid_finish_modes}! `{finish_mode} is invalid!`') - - finish_value = finish_repeat['value'] - - if finish_mode == REPEAT_FINISH_BY_OCCURRENCES: - if type(finish_value) is not int: - raise SchedulerConfigValueError(f'When the field ``mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_OCCURRENCES}`, ' \ - 'the value of field `value` must be a integer.') - - if finish_value <= 0: - raise SchedulerConfigValueError(f'When the field `mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_OCCURRENCES}`, ' \ - 'the value of field `value` must be a integer greater than 0.') - elif finish_mode == REPEAT_FINISH_BY_DATE: - if type(finish_value) is not datetime.datetime: - raise SchedulerConfigValueError(f'When the field `mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_DATE}`, ' \ - f'the value of field `value` must be a string representing a datetime.') - - finish_date = decode_datetimestr(finish_value) - now = datetime.datetime.now() - - if finish_date < now: - raise SchedulerConfigValueError(f'When the field `mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_DATE}`, ' \ - f'the value of field `value` must be a datetime greater than now.') \ No newline at end of file + :raises ConfigInvalidRepeatModeError: If the personalized repeat mode is invalid. 
+ ''' + + personalized_repeat = config_dict['personalized_repeat']['mode'] + + if personalized_repeat not in VALID_PERSONALIZED_REPEAT_MODES: + valid_repeat_modes = ', '.join(VALID_PERSONALIZED_REPEAT_MODES) + raise ConfigInvalidRepeatModeError(f'The valid repeats modes for `personalized_repeat` are: {valid_repeat_modes}. `{personalized_repeat}` is not included!') + + @staticmethod + def _valid_personalized_interval(config_dict: ConfigDict): + ''' + Validates if the personalized repeat interval is valid. + + Raises an exception if the personalized repeat interval is invalid. + + :param config_dict: The config dictionary. + + :raises ConfigValueError: If the personalized repeat interval is invalid. + ''' + personalized_interval = config_dict['personalized_repeat']['interval'] + if type(personalized_interval) is not int: + raise ConfigValueError(f'The repeat interval for `personalized_repeat` must be `int`, not `{type(personalized_interval)}`!') + + if personalized_interval <= 0: + raise ConfigValueError(f'The repeat interval for `personalized_repeat` must be a integer greater than 0!') + + @staticmethod + def _valid_personalized_extra_data(config_dict: ConfigDict): + ''' + Validates if the personalized repeat extra data is valid. + + Raises an exception if the personalized repeat extra data is invalid. + + :param config_dict: The config dictionary. + + :raises ConfigValueError: If the personalized repeat extra data is invalid. + ''' + + personalized_data = config_dict['personalized_repeat']['data'] + if type(personalized_data) not in (type(None), list, dict): + raise ConfigValueError(f'The field `data` of `personalized_repeat` must be: None, a list or a dict.') + + @staticmethod + def _valid_personalized_weekly_repeat(config_dict: ConfigDict): + ''' + Validates if the personalized repeat weekly repeat is valid. + + Raises an exception if the personalized repeat weekly repeat is invalid. + + :param config_dict: The config dictionary. + :raises ConfigValueError: If the personalized repeat weekly repeat is invalid. + ''' + + personalized_data = config_dict['personalized_repeat']['data'] + personalized_repeat_mode = config_dict['personalized_repeat']['mode'] + + if personalized_repeat_mode == WEEKLY_REPEAT_MODE: + if type(personalized_data) is not list: + raise ConfigValueError(f'If the personalized repeat mode is {WEEKLY_REPEAT_MODE}, the field' \ + ' `data` of `personalized_repeat` must be a list of integers.') + + types_in_list = {type(val) for val in personalized_data} + + if len(types_in_list) != 1 and int not in types_in_list: + raise ConfigValueError('The list of days for run in `personalized_repeat` must be integers from 0 (sunday) to 6 (saturday).') + + if min(personalized_data) < 0 or max(personalized_data) > 6: + raise ConfigValueError('The list of days for run in `personalized_repeat` must be integers from 0 (sunday) to 6 (saturday).') + + @staticmethod + def _valid_personalized_monthly_repeat(config_dict: ConfigDict): + ''' + Validates if the personalized repeat monthly repeat is valid. + + Raises an exception if the personalized repeat monthly repeat is invalid. + + :param config_dict: The config dictionary. + + :raises ConfigValueError: If the personalized repeat monthly repeat is invalid. + :raises ConfigMissingFieldError: If the personalized repeat monthly repeat is invalid. 
+        '''
+
+        personalized_data = config_dict['personalized_repeat']['data']
+        personalized_repeat_mode = config_dict['personalized_repeat']['mode']
+
+        if personalized_repeat_mode == MONTHLY_REPEAT_MODE:
+            if type(personalized_data) is not dict:
+                raise ConfigValueError(f'If the personalized repeat mode is {MONTHLY_REPEAT_MODE}, the field' \
+                                       ' `data` of `personalized_repeat` must be a dict with fields `mode` and `value`.')
+
+            fields_available = personalized_data.keys()
+            for req_field in ('mode', 'value'):
+                if req_field not in fields_available:
+                    raise ConfigMissingFieldError(f'If the personalized repeat mode is {MONTHLY_REPEAT_MODE}, the field' \
+                                                  ' `data` of `personalized_repeat` must be a dict with fields `mode` and `value`.' \
+                                                  f' `{req_field}` is missing!')
+
+            personalized_repetition_monthly_mode = personalized_data['mode']
+
+            if personalized_repetition_monthly_mode not in VALID_MONTHLY_REPEAT_MODES:
+                valid_monthly_modes = ', '.join(VALID_MONTHLY_REPEAT_MODES)
+                raise ConfigValueError(f'The monthly personalized repeat mode must be one of: {valid_monthly_modes}.')
+
+            personalized_repetition_monthly_value = personalized_data['value']
+            if type(personalized_repetition_monthly_value) is not int:
+                raise ConfigValueError('The field `value` of `data` in `personalized_repeat` must be an integer for monthly personalized repeats!')
+
+            if personalized_repetition_monthly_mode == MONTHLY_DAY_X_OCCURRENCE_TYPE:
+                if personalized_repetition_monthly_value < 1 or personalized_repetition_monthly_value > 31:
+                    raise ConfigValueError('The field `value` of `data` in `personalized_repeat` must be an integer' \
+                                           ' between 1 and 31 for the monthly personalized repeat `day-x`!')
+
+            else:
+                if personalized_repetition_monthly_value < 0 or personalized_repetition_monthly_value > 6:
+                    raise ConfigValueError('The field `value` of `data` in `personalized_repeat` must be an integer' \
+                                           f' between 0 and 6 for the monthly personalized repeat `{personalized_repetition_monthly_mode}`!')
+
+    @staticmethod
+    def _valid_personalized_finish(config_dict: ConfigDict):
+        '''
+        Validates if the personalized repeat finish is valid.
+
+        Raises an exception if the personalized repeat finish is invalid.
+
+        :param config_dict: The config dictionary.
+
+        :raises ConfigValueError: If the personalized repeat finish is invalid.
+        :raises ConfigMissingFieldError: If the personalized repeat finish is invalid.
+        :raises ConfigInvalidRepeatModeError: If the personalized repeat finish is invalid.
+
+        '''
+
+        finish_repeat = config_dict['personalized_repeat']['finish']
+        if type(finish_repeat) not in (type(None), dict):
+            raise ConfigValueError('The field `finish` of `personalized_repeat` must be None or a dict!')
+
+        if type(finish_repeat) is dict:
+            fields_available = finish_repeat.keys()
+
+            for req_field in ('mode', 'value'):
+                if req_field not in fields_available:
+                    raise ConfigMissingFieldError('If the field `finish` of `personalized_repeat` is not of ' \
+                                                  f'type NoneType, it must be a dict with fields `mode` and `value`. The field `{req_field}` is missing!')
+
+            finish_mode = finish_repeat['mode']
+
+            if finish_mode not in VALID_REPEAT_FINISH:
+                valid_finish_modes = ', '.join(VALID_REPEAT_FINISH)
+                raise ConfigInvalidRepeatModeError(f'The valid finish modes for `personalized_repeat` are: {valid_finish_modes}!' \
+                                                   f' `{finish_mode}` is invalid!')
+
+            finish_value = finish_repeat['value']
+
+            if finish_mode == REPEAT_FINISH_BY_OCCURRENCES:
+                if type(finish_value) is not int:
+                    raise ConfigValueError(f'When the field `mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_OCCURRENCES}`, ' \
+                                           'the value of field `value` must be an integer.')
+
+                if finish_value <= 0:
+                    raise ConfigValueError(f'When the field `mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_OCCURRENCES}`, ' \
+                                           'the value of field `value` must be an integer greater than 0.')
+
+            elif finish_mode == REPEAT_FINISH_BY_DATE:
+                if type(finish_value) is not str:
+                    raise ConfigValueError(f'When the field `mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_DATE}`, ' \
+                                           'the value of field `value` must be a string representing a datetime.')
+
+                finish_date = date_utils.decode_datetimestr(finish_value)
+                if finish_date is None:
+                    raise ConfigValueError(f'When the field `mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_DATE}`, ' \
+                                           'the value of field `value` must be a string in one of the valid datetime formats.')
+
+                now = datetime.datetime.now()
+
+                if finish_date < now:
+                    raise ConfigValueError(f'When the field `mode` of `finish` of `personalized_repeat` is `{REPEAT_FINISH_BY_DATE}`, ' \
+                                           'the value of field `value` must be a datetime greater than now.')
+
+    @staticmethod
+    def _valid_personalized_repeat_mode(config_dict: ConfigDict):
+        '''
+        Validates if the personalized repeat mode is valid.
+
+        Raises an exception if the personalized repeat mode is invalid.
+
+        :param config_dict: The config dictionary.
+        '''
+
+        repeat_mode = config_dict['repeat_mode']
+
+        if repeat_mode == PERSONALIZED_REPEAT_MODE:
+            Config._valid_personalized_required_fields(config_dict)
+            Config._valid_personalized_modes(config_dict)
+            Config._valid_personalized_interval(config_dict)
+            Config._valid_personalized_extra_data(config_dict)
+            Config._valid_personalized_weekly_repeat(config_dict)
+            Config._valid_personalized_monthly_repeat(config_dict)
+            Config._valid_personalized_finish(config_dict)
+
+    @staticmethod
+    def valid_config(config_dict: ConfigDict) -> None:
+        '''
+        Validates if the config is valid.
+
+        Raises an exception if the config is invalid.
+
+        :param config_dict: The config dictionary.
+        '''
+
+        Config._valid_required_fields(config_dict)
+        Config._valid_start_date_and_timezone(config_dict)
+        Config._valid_repeat_mode(config_dict)
+        Config._valid_behavior_after_system_restart(config_dict)
+        Config._valid_personalized_repeat_mode(config_dict)
\ No newline at end of file
diff --git a/src/schedule/schedule/config_dict.py b/src/schedule/schedule/config_dict.py
new file mode 100644
index 00000000..2e74eb16
--- /dev/null
+++ b/src/schedule/schedule/config_dict.py
@@ -0,0 +1,91 @@
+from typing import List, Union
+from typing_extensions import Literal, TypedDict
+
+class Finish(TypedDict):
+    '''
+    Specifies when the job should stop running.
+
+    The attribute `mode` can be one of the following:
+    - never: the job will never stop running.
+    - occurrence: the job will stop running after `value` runs.
+    - date: the job will stop running after the datetime given by `value`.
+
+    The attribute `value` is the value of the mode. It can be:
+    - None: if the mode is never.
+    - int: if the mode is occurrence.
+    - str: if the mode is date (a string in one of the valid datetime formats).
+    '''
+
+    mode: Literal['never', 'occurrence', 'date']
+    value: Union[None, int, str]
+
+
+class MonthlyRepeatConf(TypedDict):
+    '''
+    Specifies how the job should repeat monthly.
+
+    The attribute `mode` can be one of the following:
+    - first-weekday: The job will run on the first occurrence in the month of the weekday given by `value` (numbered from 0 - sunday).
+    - last-weekday: The job will run on the last occurrence in the month of the weekday given by `value` (numbered from 0 - sunday).
+    - day-x: The job will run on day `value` of the month. If the month doesn't have that day, it will run on the last day of the month.
+
+    The attribute `value` is the value of the mode. It can be:
+    - int between 0 and 6 (the weekday): if the mode is first-weekday or last-weekday.
+    - int between 1 and 31 (the day of the month): if the mode is day-x.
+    '''
+
+    mode: Literal['first-weekday', 'last-weekday', 'day-x']
+    value: int
+
+class PersonalizedRepeat(TypedDict):
+    '''
+    Specifies how the job should repeat.
+
+    The attribute `mode` can be one of the following:
+    - minutely: The job will run every `interval` minutes.
+    - hourly: The job will run every `interval` hours.
+    - daily: The job will run every `interval` days.
+    - weekly: The job will run every `interval` weeks, on the days of the week specified in `data`.
+    - monthly: The job will run every `interval` months, on the day specified in `data`.
+    - yearly: The job will run every `interval` years.
+
+    The attribute `interval` is the interval of the repetition.
+
+    The attribute `data` is the extra data of the repetition. It can be:
+    - None: if the mode is minutely, hourly, daily or yearly.
+    - List: if the mode is weekly. The list contains the days of the week (starting from 0 - sunday) that the job will run.
+    - MonthlyRepeatConf: if the mode is monthly. It specifies how the job should repeat monthly.
+    '''
+
+    mode: Literal['minutely', 'hourly', 'daily', 'weekly', 'monthly', 'yearly']
+    interval: int
+    data: Union[None, List, MonthlyRepeatConf]
+    finish: Finish
+
+class ConfigDict(TypedDict):
+    '''
+    Specifies the configuration of the job.
+
+    The attribute `start_date` is the date that the job should start running.
+
+    The attribute `timezone` is the timezone of the job.
+
+    The attribute `behavior_after_system_restart` is the behavior of the job after the system restarts. It can be:
+    - None: The default behavior (run the job immediately after the system restarts) is used.
+    - int: If the value is 0, the job will be cancelled. If the value is 1, the job will be rescheduled to its next valid run after the system restarts.
+        If the value is 2, the job will run immediately after the system restarts.
+
+    The attribute `repeat_mode` is the mode of the repetition. It can be:
+    - no_repeat: The job will run only once.
+    - minutely: The job will run every minute.
+    - hourly: The job will run every hour.
+    - daily: The job will run every day.
+    - weekly: The job will run every week.
+    - monthly: The job will run every month.
+    - yearly: The job will run every year.
+    - personalized: The job will run according to the configuration specified in `personalized_repeat`.
+
+    The attribute `personalized_repeat` is the configuration of the repetition. It is only used if the repeat_mode is personalized.
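+
+    Example (an illustrative sketch only - the dates, timezone and repeat settings below are
+    arbitrary values chosen to show the expected shape of the dictionary, not values taken
+    from a real deployment):
+
+        config: ConfigDict = {
+            'start_date': '01-06-2023 10:00',
+            'timezone': 'America/Sao_Paulo',
+            'behavior_after_system_restart': None,
+            'repeat_mode': 'personalized',
+            'personalized_repeat': {
+                'mode': 'weekly',
+                'interval': 2,
+                'data': [1, 3, 5],   # monday, wednesday and friday (0 - sunday)
+                'finish': {'mode': 'occurrence', 'value': 10}
+            }
+        }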
+ ''' + + start_date: str + timezone: str + behavior_after_system_restart: Union[None, int] + repeat_mode: Literal['no_repeat', 'minutely', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'personalized'] + personalized_repeat: Union[None, PersonalizedRepeat] \ No newline at end of file diff --git a/src/schedule/schedule/constants.py b/src/schedule/schedule/constants.py index 1f5cca47..37fa0381 100644 --- a/src/schedule/schedule/constants.py +++ b/src/schedule/schedule/constants.py @@ -1,25 +1,10 @@ import environ +from sqlalchemy.orm import declarative_base -from sqlalchemy import create_engine -from sqlalchemy.orm import declarative_base, sessionmaker - +SQL_ALCHEMY_BASE = declarative_base() ENV = environ.Env( POSTGRES_SCHED_CONFIG_TABLE_NAME=(str, 'sched_config'), - POSTGRES_SCHED_JOB_TABLE_NAME=(str, 'sched_job'), - - POSTGRES_USER=(str, 'django'), - POSTGRES_PASSWORD=(str, 'c01_password'), - POSTGRES_HOST=(str, 'localhost'), - POSTGRES_PORT=(int, 5432), - POSTGRES_DB=(str, 'c01_prod'), -) - -# SQL ALCHEMY CONFIG - -SQL_ALCHEMY_BASE = declarative_base() -DB_URI = f'postgresql://{ENV("POSTGRES_USER")}:{ENV("POSTGRES_PASSWORD")}@{ENV("POSTGRES_HOST")}:{ENV("POSTGRES_PORT")}/{ENV("POSTGRES_DB")}' -SQL_ALCHEMY_ENGINE = create_engine(DB_URI, echo=False) -SQL_ALCHEMY_DB_SESSION = sessionmaker(bind=SQL_ALCHEMY_ENGINE)() + POSTGRES_SCHED_JOB_TABLE_NAME=(str, 'sched_job')) # SCHEDULE CONFIG @@ -31,6 +16,8 @@ MONTHLY_LAST_WEEKDAY_OCCURRENCE_TYPE = 'last-weekday' NO_REPEAT_MODE = 'no_repeat' +MINUTELY_REPEAT_MODE = 'minutely' +HOURLY_REPEAT_MODE = 'hourly' DAILY_REPEAT_MODE = 'daily' WEEKLY_REPEAT_MODE = 'weekly' MONTHLY_REPEAT_MODE = 'monthly' @@ -41,11 +28,11 @@ REPEAT_FINISH_BY_OCCURRENCES = 'occurrence' REPEAT_FINISH_BY_DATE = 'date' -VALID_REPEAT_MODES = (NO_REPEAT_MODE, DAILY_REPEAT_MODE, WEEKLY_REPEAT_MODE, - MONTHLY_REPEAT_MODE, YEARLY_REPEAT_MODE, PERSONALIZED_REPEAT_MODE) +VALID_REPEAT_MODES = (NO_REPEAT_MODE, MINUTELY_REPEAT_MODE, HOURLY_REPEAT_MODE, DAILY_REPEAT_MODE, + WEEKLY_REPEAT_MODE, MONTHLY_REPEAT_MODE, YEARLY_REPEAT_MODE, PERSONALIZED_REPEAT_MODE) -VALID_PERSONALIZED_REPEAT_MODES = (NO_REPEAT_MODE, DAILY_REPEAT_MODE, WEEKLY_REPEAT_MODE, - MONTHLY_REPEAT_MODE, YEARLY_REPEAT_MODE) +VALID_PERSONALIZED_REPEAT_MODES = (NO_REPEAT_MODE, MINUTELY_REPEAT_MODE, HOURLY_REPEAT_MODE, DAILY_REPEAT_MODE, + WEEKLY_REPEAT_MODE, MONTHLY_REPEAT_MODE, YEARLY_REPEAT_MODE) VALID_MONTHLY_REPEAT_MODES = (MONTHLY_DAY_X_OCCURRENCE_TYPE, MONTHLY_FIRST_WEEKDAY_OCCURRENCE_TYPE, @@ -54,8 +41,6 @@ VALID_REPEAT_FINISH = (REPEAT_FINISH_NEVER, REPEAT_FINISH_BY_OCCURRENCES, REPEAT_FINISH_BY_DATE) VALID_DATETIME_FORMATS = ( - '%Y-%m-%dT%H:%M', - '%Y-%m-%dT%H:%M:%S' '%Y-%m-%d %H:%M:%S', '%d-%m-%Y %H:%M:%S', '%Y-%m-%d %H:%M', @@ -65,4 +50,13 @@ ) REQUIRED_FIELDS = ('start_date', 'timezone', 'repeat_mode') -PERSONALIZED_REQUIRED_FIELDS = ('mode', 'interval', 'data', 'finish') \ No newline at end of file +PERSONALIZED_REQUIRED_FIELDS = ('mode', 'interval', 'data', 'finish') + +# Behavior of the task when the system is restarted + +CANCELL_TASK_ON_RESTART = 0 +RESCHEDULE_TASK_ON_RESTART = 1 +RUN_TASK_IMMEDIATELLY = 2 + +VALID_BEHAVIOR_AFTER_SYSTEM_RESTART = (CANCELL_TASK_ON_RESTART, RESCHEDULE_TASK_ON_RESTART, RUN_TASK_IMMEDIATELLY) +DEFAULT_BEHAVIOR_AFTER_SYSTEM_RESTART = RUN_TASK_IMMEDIATELLY \ No newline at end of file diff --git a/src/schedule/schedule/utils.py b/src/schedule/schedule/date_utils.py similarity index 54% rename from src/schedule/schedule/utils.py rename to src/schedule/schedule/date_utils.py index 
cac9fe12..6b88c2ac 100644 --- a/src/schedule/schedule/utils.py +++ b/src/schedule/schedule/date_utils.py @@ -1,10 +1,23 @@ import calendar import datetime from typing import Optional - from schedule.constants import * def get_date(year: int, month: int, day: int, hour: int = 0, minute: int = 0, second: int = 0) -> datetime: + ''' + Get a datetime object for the given year, month, day, hour, minute, and second. + + If the given day is invalid, the last day of the month will be used instead. + + :param year: The year. + :param month: The month. + :param day: The day. + :param hour: The hour. + :param minute: The minute. + :param second: The second. + ''' + + try: date = datetime.datetime(year, month, day, hour, minute, second) @@ -15,16 +28,49 @@ def get_date(year: int, month: int, day: int, hour: int = 0, minute: int = 0, se return date def get_last_day_of_month(month: int, year: int) -> int: + ''' + Get the last day of the given month and year. + + :param month: The month. + :param year: The year. + + :return: The last day of the given month and year. + ''' return calendar.monthrange(year, month)[1] -def get_first_weekday_date_of_month(weekday: int, year: int, month: int, hour: int = 0, minute: int = 1, second: int = 1) -> datetime: +def get_first_weekday_date_of_month(weekday: int, year: int, month: int, hour: int = 0, minute: int = 1, second: int = 1) -> datetime.datetime: + ''' + Get the first weekday date of the given month and year. + + :param weekday: The weekday. + :param year: The year. + :param month: The month. + :param hour: The hour. + :param minute: The minute. + :param second: The second. + + :return: The first weekday date of the given month and year. + ''' for day in range(1, 8): date = datetime.datetime(year, month, day, hour, minute, second) if (date.weekday() + 1) % NUM_DAYS_IN_WEEK == weekday: return date return None -def get_last_weekday_date_of_month(weekday: int, year: int, month: int, hour: int = 0, minute: int = 1, second: int = 1) -> datetime: +def get_last_weekday_date_of_month(weekday: int, year: int, month: int, hour: int = 0, minute: int = 1, second: int = 1) -> datetime.datetime: + ''' + Get the last weekday date of the given month and year. + + :param weekday: The weekday. + :param year: The year. + :param month: The month. + :param hour: The hour. + :param minute: The minute. + :param second: The second. + + :return: The last weekday date of the given month and year. + ''' + last_day_of_month = get_last_day_of_month(month, year) for day in range(last_day_of_month - 6, last_day_of_month + 1): date = datetime.datetime(year, month, day, hour, minute, second) @@ -32,7 +78,17 @@ def get_last_weekday_date_of_month(weekday: int, year: int, month: int, hour: in return date return None -def weeks_next_execution_date(base_date, days_of_week, interval_weeks=1): +def weeks_next_execution_date(base_date, days_of_week, interval_weeks=1) -> datetime.datetime: + ''' + Calculate the next execution date based on the given base date, days of week, and interval in weeks. + + :param base_date: The base date. + :param days_of_week: The days of week. + :param interval_weeks: The interval in weeks. + + :return: The next execution date. 
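+
+    :note: `days_of_week` is assumed to use the same weekday numbering as the rest of this
+        module (0 - sunday, 1 - monday, ..., 6 - saturday), which is what the
+        `(base_date.weekday() + 1) % NUM_DAYS_IN_WEEK` conversion below implies.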
+ ''' + days_of_week.sort() current_day = (base_date.weekday() + 1) % NUM_DAYS_IN_WEEK @@ -50,7 +106,21 @@ def month_next_execution_date(base_date: datetime.datetime, day_x: int = 1, first_weekday_to_run: int = 0, last_weekday_to_run: int = 0, - interval: int = 1): + interval: int = 1) -> datetime.datetime: + + ''' + Calculate the next execution date based on the given base date, ocurrency type, day, first weekday to run, last weekday to run, and interval in months. + + :param base_date: The base date. + :param ocurrency_type: The ocurrency type. + :param day_x: The day. + :param first_weekday_to_run: The first weekday to run. + :param last_weekday_to_run: The last weekday to run. + :param interval: The interval in months. + + :return: The next execution date. + ''' + year = base_date.year + (base_date.month + interval - 1) // NUM_MONTHS_IN_YEAR month = (base_date.month + interval - 1) % NUM_MONTHS_IN_YEAR + 1 @@ -66,7 +136,16 @@ def month_next_execution_date(base_date: datetime.datetime, else: raise ValueError(f'Invalid ocurrency_type: {ocurrency_type}') -def year_next_execution_date(base_date: datetime.datetime, interval: int = 1): +def year_next_execution_date(base_date: datetime.datetime, interval: int = 1) -> datetime.datetime: + ''' + Calculate the next execution date based on the given base date and interval in years. + + :param base_date: The base date. + :param interval: The interval in years. + + :return: The next execution date. + ''' + return get_date(base_date.year + interval, base_date.month, base_date.day, base_date.hour, base_date.minute, @@ -74,18 +153,32 @@ def year_next_execution_date(base_date: datetime.datetime, interval: int = 1): def decode_datetimestr( datetime_str: str) -> Optional[datetime.datetime]: - for f in VALID_DATETIME_FORMATS: - try: - return datetime.datetime.strptime(datetime_str, f) - except ValueError: - pass - return None + ''' + Decode the given datetime string. + + :param datetime_str: The datetime string. + + :return: The decoded datetime string. + ''' + + for f in VALID_DATETIME_FORMATS: + try: + return datetime.datetime.strptime(datetime_str, f) + except ValueError: + pass + return None def apply_timezone(datetime_obj: datetime.datetime, timezone = None) -> datetime.datetime: + ''' + Apply the given timezone to the given datetime object. + + :param datetime_obj: The datetime object. + :param timezone: The timezone. + + :return: The datetime object with the given timezone. 
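+
+    :note: When a timezone is given, the result is still a naive datetime: the timezone is
+        applied and then stripped again via `replace(tzinfo=None)`, as the return statement
+        below shows.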
+ ''' + if timezone is None: return datetime_obj - return timezone.localize(datetime_obj).astimezone(timezone).replace(tzinfo=None) - -def create_db_tables(): - SQL_ALCHEMY_BASE.metadata.create_all(bind=SQL_ALCHEMY_ENGINE) \ No newline at end of file + return timezone.localize(datetime_obj).astimezone(timezone).replace(tzinfo=None) \ No newline at end of file diff --git a/src/schedule/schedule/function_wrapper.py b/src/schedule/schedule/function_wrapper.py index 5525b0cb..6bc9f92c 100644 --- a/src/schedule/schedule/function_wrapper.py +++ b/src/schedule/schedule/function_wrapper.py @@ -1,5 +1,4 @@ from typing import Any, Callable, Dict, List, Any - class FunctionWrapper: def __init__(self, funct: Callable, *args, **kwargs): self.funct: Callable = funct diff --git a/src/schedule/schedule/job.py b/src/schedule/schedule/job.py index 2c7f02fb..85dd6e52 100644 --- a/src/schedule/schedule/job.py +++ b/src/schedule/schedule/job.py @@ -1,13 +1,13 @@ import logging import datetime -from typing import Callable +from typing import Callable, Any -from sqlalchemy import Column, Integer, PickleType, DateTime, ARRAY, ForeignKey, Boolean +from sqlalchemy import Column, Integer, PickleType, DateTime, ForeignKey, Boolean from sqlalchemy.orm import relationship -from schedule.utils import * -from schedule.constants import * +from schedule.constants import (ENV, SQL_ALCHEMY_BASE, CANCELL_TASK_ON_RESTART, + RESCHEDULE_TASK_ON_RESTART, RUN_TASK_IMMEDIATELLY) from schedule.config import Config from schedule.function_wrapper import FunctionWrapper @@ -18,7 +18,6 @@ class CancelJob(object): """ Can be returned from a job to unschedule itself. """ - pass class Job(SQL_ALCHEMY_BASE): @@ -38,15 +37,31 @@ class Job(SQL_ALCHEMY_BASE): job_funct = Column(PickleType, default=None, nullable=False) - def __init__(self, sched_config: Config) -> None: + def __init__(self, sched_config: Config, db_session = None) -> None: + ''' + Create a new job. + + :param sched_config: A dictionary with the job's schedule configuration. + ''' + self.sched_config: Config = sched_config self.num_repeats: int = 0 + self.db_session = db_session + def __lt__(self, other: 'Job') -> bool: assert self.next_run is not None, "must run _schedule_next_run before" assert other.next_run is not None, "must run _schedule_next_run before" return self.next_run < other.next_run + def __eq__(self, other: 'Job') -> bool: + return self.cancelled == other.cancelled and \ + self.sched_config_id == other.sched_config_id and \ + self.num_repeats == other.num_repeats and \ + self.last_run == other.last_run and \ + self.next_run == other.next_run and \ + self.job_funct == other.job_funct + def __repr__(self) -> str: return f"" @@ -54,29 +69,88 @@ def __str__(self) -> str: return f"Job (id={self.id}, sched_config_id={self.sched_config.id}, num_repeats={self.num_repeats}, last_run={self.last_run}, next_run={self.next_run})" def do(self, job_func: Callable, *args, **kwargs): + ''' + Schedule a new job. + + :param job_func: The function to be scheduled. + :param args: The arguments to call the job_func with. + :param kwargs: The keyword arguments to call the job_func with. + + ''' self.job_funct = FunctionWrapper(job_func, *args, **kwargs) self._schedule_first_run() + self.save() def save(self): + ''' + Save the job to the database. 
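+
+        If no database session was provided when the job was created, the method returns
+        without touching the database (see the `db_session is None` check below).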
+ ''' + self.sched_config.save() + + if self.db_session is None: + return - SQL_ALCHEMY_DB_SESSION.add(self) - SQL_ALCHEMY_DB_SESSION.commit() + self.db_session.add(self) + self.db_session.commit() + + def recover(self): + ''' + Ensure that the job is scheduled to run again after a system restart. + ''' + + # Pending task during idle time + if self.next_run < self.sched_config.now(): + if self.sched_config.behavior_after_system_restart == CANCELL_TASK_ON_RESTART: + self.cancel() + elif self.sched_config.behavior_after_system_restart == RESCHEDULE_TASK_ON_RESTART: + self._schedule_next_run() + + elif self.sched_config.behavior_after_system_restart == RUN_TASK_IMMEDIATELLY: + self.run() + + else: + raise ValueError(f'Invalid behavior_after_system_restart: {self.sched_config.behavior_after_system_restart}') + @property def should_run(self) -> bool: - print(f'Job {self} should run? Currentime time: {self.sched_config.now()}') - + ''' + Check if the job should run. + ''' assert self.next_run is not None, 'must run _schedule_next_run before' return self.sched_config.now() >= self.next_run + + def exec_funct(self) -> Any: + ''' + Execute the job function. + + :return: The return value of the job function. + ''' + + if self.job_funct is None: + raise ValueError('job_func is None') + + return self.job_funct() def run(self): + ''' + Run the job. + ''' + if self._is_overdue(self.sched_config.now()): logger.debug(f'Cancelling job {self}.\n\tReason: The job is overdue.') return CancelJob logger.debug('Running job %s', self) - ret = self.job_funct() + + try: + ret = self.exec_funct() + + except Exception as e: + logger.exception('Error running job %s', self) + logger.debug(f'Cancelling job {self}.\n\tReason: Exception raised.') + return CancelJob self.num_repeats += 1 if self._achieved_max_repeats(): @@ -84,6 +158,7 @@ def run(self): return CancelJob self.last_run = self.sched_config.now() + self._schedule_next_run() # The repeat_mode is no_repeat, so we cancel the job @@ -94,23 +169,41 @@ def run(self): if self._is_overdue(self.next_run): logger.debug(f'Cancelling next job {self} run.\n\tReason: The job is overdue.') return CancelJob - + return ret def cancel(self): + ''' + Cancel the job. + ''' self.cancelled = True - self.save() def _schedule_first_run(self) -> None: + ''' + Schedule the first run of the job. + ''' self.next_run = self.sched_config.first_run_date() - self.save() - def _schedule_next_run(self) -> None: + def _schedule_next_run(self) -> None: + ''' + Schedule the next run of the job. + ''' self.next_run = self.sched_config.next_run_date(self.next_run) - self.save() + + # If the next run is overdue, we schedule the next run. + # This can happen if the system is down for a long time + while self.next_run is not None and self._is_overdue(self.next_run): + logger.debug(f'Job {self} is overdue. Scheduling next run.') + self.next_run = self.sched_config.next_run_date(self.next_run) def _is_overdue(self, when: datetime.datetime) -> bool: + ''' + Check if the job is overdue. + ''' return self.sched_config.max_datetime is not None and when > self.sched_config.max_datetime def _achieved_max_repeats(self) -> bool: + ''' + Check if the job achieved the max repeats. 
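+
+        Always returns False when `max_repeats` is not set in the schedule config.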
+ ''' return self.sched_config.max_repeats is not None and self.num_repeats >= self.sched_config.max_repeats \ No newline at end of file diff --git a/src/schedule/schedule/schedule.py b/src/schedule/schedule/schedule.py new file mode 100644 index 00000000..964da51d --- /dev/null +++ b/src/schedule/schedule/schedule.py @@ -0,0 +1,157 @@ +import time +import logging +from typing import Callable, List + +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +from schedule.constants import SQL_ALCHEMY_DB_SESSION, SQL_ALCHEMY_ENGINE, SQL_ALCHEMY_BASE +from schedule.config import ConfigDict, Config +from schedule.job import Job, CancelJob + +logger = logging.getLogger('scheduler') +logger.setLevel(logging.DEBUG) + +class ScheduleError(Exception): + """Base schedule exception""" + + pass + +class ScheduleValueError(ScheduleError): + """Base schedule value error""" + + pass + +class IntervalError(ScheduleValueError): + """An improper interval was used""" + + pass + +class Scheduler: + def __init__(self, + persist_task: bool = False, + db_host: str = 'localhost', + db_port: str = 5432, + db_user: str = 'sched_user', + db_pass: str = 'sched_pass', + db_db: str = 'sched_db' + ) -> None: + + self.jobs: List[Job] = list() + self.db_session = None + self.db_engine = None + + if persist_task: + for arg in (db_host, db_port, db_user, db_pass, db_db): + assert arg is not None, "Must provide all arguments for persisting tasks" + + self._create_db_session(db_host, db_port, db_user, db_pass, db_db) + self._create_db_tables() + self._load_jobs_from_db() + + def _create_db_session(self, db_host: str, db_port: str, db_user: str, db_pass: str, db_db: str,): + db_uri = f'postgresql://{db_user}:{db_pass}@{db_host}:{db_port}/{db_db}' + + self.db_engine = create_engine(db_uri) + self.db_session = sessionmaker(bind=self.db_engine)() + + def _create_db_tables(self) -> None: + SQL_ALCHEMY_BASE.metadata.create_all(bind=self.db_engine) + + def run_pending(self) -> None: + ''' + Run all jobs that are scheduled to run. + + Please note that it is *intended behavior that run_pending() + does not run missed jobs*. For example, if you've registered a job + that should run every minute and you only call run_pending() + in one hour increments then your job won't be run 60 times in + between but only once. + ''' + runnable_jobs = (job for job in self.jobs if job.should_run) + for job in sorted(runnable_jobs): + self._run_job(job) + + def _run_job(self, job: "Job") -> None: + ret = job.run() + job.save() + + if isinstance(ret, CancelJob) or ret is CancelJob: + self.cancel_job(job) + + def schedule_job(self, sched_config_dict: ConfigDict, job_func: Callable, *job_args, **job_kwargs) -> Job: + ''' + Schedule a new job. + + :param sched_config_dict: A dictionary with the job's schedule configuration. + :param job_func: The function to be scheduled. + :param job_args: Arguments passed to `job_func` when the job runs. + :param job_kwargs: Keyword arguments passed to `job_func` when the job runs. + + :return: The scheduled job. + ''' + logger.debug('Scheduling job "%s" %s %s', job_func.__name__, job_args, job_kwargs) + + sched_config = Config(self.db_session) + sched_config.load_config(sched_config_dict) + + new_job = Job(sched_config, self.db_session) + new_job.do(job_func, *job_args, **job_kwargs) + + self.jobs.append(new_job) + + return new_job + + def cancel_job(self, job: Job) -> None: + ''' + Delete a scheduled job. 
+ + :param job: The job to be unscheduled + ''' + try: + logger.debug('Cancelling job "%s"', job) + + job.cancel() + job.save() + + self.jobs.remove(job) + + except ValueError: + logger.debug('Cancelling not-scheduled job "%s"', job) + + def _load_jobs_from_db(self) -> None: + ''' + Recover jobs from the database if the job is not cancelled. + ''' + + logger.debug('Recovering jobs') + retrieved_jobs = SQL_ALCHEMY_DB_SESSION.query(Job).filter(Job.cancelled == False).all() + + self.jobs = list() + + for job in sorted(retrieved_jobs): + job.recover() + + if not job.cancelled: + self.jobs.append(job) + + def cancel_all_jobs(self) -> None: + ''' + Clear all scheduled jobs. + ''' + logger.debug('Cancelling all jobs') + + for job in self.jobs: + job.cancel() + + self.jobs.clear() + + def run_all(self, delay_seconds: int = 0) -> None: + ''' + Run all jobs regardless if they are scheduled to run or not. + + :param delay_seconds: The delay in seconds between each job + ''' + for job in self.jobs: + self._run_job(job) + time.sleep(delay_seconds) \ No newline at end of file diff --git a/src/schedule/schedule/scheduler.py b/src/schedule/schedule/scheduler.py deleted file mode 100644 index f95bd323..00000000 --- a/src/schedule/schedule/scheduler.py +++ /dev/null @@ -1,108 +0,0 @@ -import time -import logging -from typing import Callable, List - -from schedule.constants import * -from schedule.utils import * -from schedule.config import Config -from schedule.job import Job, CancelJob - -logger = logging.getLogger('scheduler') -logger.setLevel(logging.DEBUG) - -class ScheduleError(Exception): - """Base schedule exception""" - - pass - -class ScheduleValueError(ScheduleError): - """Base schedule value error""" - - pass - -class IntervalError(ScheduleValueError): - """An improper interval was used""" - - pass - -class Scheduler: - def __init__(self) -> None: - self.jobs: List[Job] = list() - - create_db_tables() - self.recover_jobs() - - def run_pending(self) -> None: - """ - Run all jobs that are scheduled to run. - - Please note that it is *intended behavior that run_pending() - does not run missed jobs*. For example, if you've registered a job - that should run every minute and you only call run_pending() - in one hour increments then your job won't be run 60 times in - between but only once. - """ - runnable_jobs = (job for job in self.jobs if job.should_run) - for job in sorted(runnable_jobs): - self._run_job(job) - - def _run_job(self, job: "Job") -> None: - ret = job.run() - if isinstance(ret, CancelJob) or ret is CancelJob: - self.cancel_job(job) - - def schedule_job(self, scheduler_config: Config, job_func: Callable, *job_args, **job_kwargs) -> Job: - """ - Add a job to the schedule. - - :param job: The job to be added - """ - new_job = Job(scheduler_config) - new_job.do(job_func, *job_args, **job_kwargs) - self.jobs.append(new_job) - - return new_job - - def cancel_job(self, job: Job) -> None: - """ - Delete a scheduled job. - - :param job: The job to be unscheduled - """ - try: - logger.debug('Cancelling job "%s"', job) - job.cancel() - self.jobs.remove(job) - - except ValueError: - logger.debug('Cancelling not-scheduled job "%s"', job) - - def recover_jobs(self) -> None: - """ - Recover jobs from the database if the job is not cancelled. - """ - - logger.debug('Recovering jobs') - self.jobs = SQL_ALCHEMY_DB_SESSION.query(Job).filter(Job.cancelled == False).all() - self.run_pending() - - def cancel_all_jobs(self) -> None: - """ - Clear all scheduled jobs. 
- """ - logger.debug('Cancelling all jobs') - - for job in self.jobs: - job.cancel() - - self.jobs.clear() - - def run_all(self, delay_seconds: int = 0) -> None: - """ - Run all jobs regardless if they are scheduled to run or not. - - :param delay_seconds: The delay in seconds between each job - """ - for job in self.jobs: - self._run_job(job) - time.sleep(delay_seconds) \ No newline at end of file diff --git a/src/schedule/setup.py b/src/schedule/setup.py index 4b549887..ce6e2c27 100644 --- a/src/schedule/setup.py +++ b/src/schedule/setup.py @@ -14,8 +14,12 @@ license="MIT", author='Elves Rodrigues', packages=setuptools.find_packages(), - # In production we may want to use the psycopg2 package itself, I'm using - # the psycopg2-binary package here to avoid problems with external - # libraries - install_requires=['SQLAlchemy==2.0.7', 'pytz==2022.1'] + install_requires=[ + 'mock-alchemy==0.2.6', + 'psycopg2-binary==2.9.6', + 'python-environ==0.4.54', + 'pytz==2023.3', + 'SQLAlchemy==2.0.10', + 'typing_extensions==4.5.0', + ] ) diff --git a/src/schedule/tests/test_config.py b/src/schedule/tests/test_config.py new file mode 100644 index 00000000..b97d4dbb --- /dev/null +++ b/src/schedule/tests/test_config.py @@ -0,0 +1,219 @@ +import unittest +from mock_alchemy.mocking import UnifiedAlchemyMagicMock + +import datetime +import unittest + +from schedule.constants import * +from schedule.config import (ConfigDict, + Config, + ConfigInvalidRepeatModeError, + ConfigMissingFieldError, + ConfigValueError, + REQUIRED_FIELDS) + +class TestConfig(unittest.TestCase): + def setUp(self) -> None: + self.config_dict: ConfigDict = { + 'start_date': None, + 'repeat_mode': None, + 'timezone': 'America/Sao_Paulo', + } + self._fill_start_date() + + self.db_session = UnifiedAlchemyMagicMock() + + def _fill_start_date(self): + # Fill the field start date with a valid datetime + now = datetime.datetime.now() + datetime.timedelta(days=1) + self.config_dict['start_date'] = now.strftime(VALID_DATETIME_FORMATS[0]) + + def _fill_personalized_repeat(self): + self.config_dict['repeat_mode'] = PERSONALIZED_REPEAT_MODE + self.config_dict['personalized_repeat'] = { + 'mode': 'daily', + 'interval': 1, #invalid, must be a int greater than 0 + 'data': None, + 'finish': None + } + + def test_raise_exception_if_missing_required_fields(self): + with self.assertRaises(ConfigMissingFieldError): + for req_field in REQUIRED_FIELDS: + fields_with_missing_required_field = list(REQUIRED_FIELDS) + fields_with_missing_required_field.remove(req_field) + config_dict: ConfigDict = {field: None for field in fields_with_missing_required_field} + Config.valid_config(config_dict) + + def test_raise_exception_with_invalid_start_date(self): + now = datetime.datetime.now() + + past_date = now - datetime.timedelta(days=1) + past_date_str = past_date.strftime(VALID_DATETIME_FORMATS[0]) + + valid_date = now + datetime.timedelta(days=1) + invalid_format_date = valid_date.strftime("%m/%d/%Y, %H:%M:%S") + + for invalid_input in (None, invalid_format_date, past_date_str): + self.config_dict['start_date'] = invalid_input + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + def test_raise_exception_with_invalid_repeat_mode(self): + self.config_dict['repeat_mode'] = 'unknow_repeat_mode' + with self.assertRaises(ConfigInvalidRepeatModeError): + Config.valid_config(self.config_dict) + + def test_raise_if_personalized_repeat_value_is_not_dict(self): + self.config_dict['repeat_mode'] = PERSONALIZED_REPEAT_MODE + 
self.config_dict['personalized_repeat'] = None + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + def test_raise_if_personalized_repeat_is_missing_required_fields(self): + with self.assertRaises(ConfigMissingFieldError): + for req_field in PERSONALIZED_REQUIRED_FIELDS: + fields_with_missing_required_field = list(PERSONALIZED_REQUIRED_FIELDS) + fields_with_missing_required_field.remove(req_field) + config_dict: ConfigDict = {field: None for field in fields_with_missing_required_field} + Config.valid_config(config_dict) + + def test_raise_if_personalized_repeat_has_invalid_repeat_interval(self): + self._fill_personalized_repeat() + + self.config_dict['personalized_repeat']['interval'] = '1' + + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + self.config_dict['personalized_repeat']['interval'] = -1 + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + def test_raise_if_personalized_repeat_has_invalid_repeat_mode(self): + self._fill_personalized_repeat() + + self.config_dict['personalized_repeat']['mode'] = 'unknow_mode' + + with self.assertRaises(ConfigInvalidRepeatModeError): + Config.valid_config(self.config_dict) + + def test_raise_if_personalized_repeat_mode_has_invalid_weekly_config(self): + self._fill_personalized_repeat() + + self.config_dict['personalized_repeat']['mode'] = WEEKLY_REPEAT_MODE + + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + self.config_dict['personalized_repeat']['data'] = [] + + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + self.config_dict['personalized_repeat']['data'] = [7] + + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + self.config_dict['personalized_repeat']['data'] = [-1] + + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): + self._fill_personalized_repeat() + + self.config_dict['personalized_repeat']['mode'] = MONTHLY_REPEAT_MODE + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + self.config_dict['personalized_repeat']['data'] = {} + with self.assertRaises(ConfigMissingFieldError): + Config.valid_config(self.config_dict) + + required_fields = ['mode', 'value'] + for req_field in required_fields: + fields_with_missing_required_field = required_fields.copy() + fields_with_missing_required_field.remove(req_field) + self.config_dict['personalized_repeat']['data'] = {field: None for field in fields_with_missing_required_field} + with self.assertRaises(ConfigMissingFieldError): + Config.valid_config(self.config_dict) + + # Personalized monthly repeat type of type DAY-X must receive a integer in the field `value` of + # the dict `data`, and must be between 1 and 31. + for invalid_value in ['-1', 0, 32]: + self.config_dict['personalized_repeat']['data'] = { + 'mode': MONTHLY_DAY_X_OCCURRENCE_TYPE, + 'value':invalid_value + } + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + # Personalized monthly repeat type of type first-weekday or last-weekday must receive a integer in the field `value` of + # the dict `data`, and must be between 0 and 6. 
+ for mode in (MONTHLY_FIRST_WEEKDAY_OCCURRENCE_TYPE, MONTHLY_LAST_WEEKDAY_OCCURRENCE_TYPE): + for invalid_value in ['-1', -1, 7]: + self.config_dict['personalized_repeat']['data'] = { + 'mode': mode, + 'value':invalid_value + } + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): + self._fill_personalized_repeat() + + self.config_dict['personalized_repeat']['finish'] = {} + + with self.assertRaises(ConfigMissingFieldError): + Config.valid_config(self.config_dict) + + self.config_dict['personalized_repeat']['finish'] = { + 'mode': 'unknown_mode', + 'value': None + } + + with self.assertRaises(ConfigInvalidRepeatModeError): + Config.valid_config(self.config_dict) + + for invalid_input in ('-100', 0): + self.config_dict['personalized_repeat']['finish'] = { + 'mode': REPEAT_FINISH_BY_OCCURRENCES, + 'value': invalid_input + } + + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + now = datetime.datetime.now() + + past_date = now - datetime.timedelta(days=1) + past_date_str = past_date.strftime(VALID_DATETIME_FORMATS[0]) + + valid_date = now + datetime.timedelta(days=1) + invalid_format_date = valid_date.strftime("%m/%d/%Y, %H:%M:%S") + + for invalid_input in (None, invalid_format_date, past_date_str): + self.config_dict['personalized_repeat']['finish'] = { + 'mode': REPEAT_FINISH_BY_DATE, + 'value': invalid_input + } + + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) + + def test_if_config_can_be_retrieved_from_db(self): + self._fill_personalized_repeat() + + Config.valid_config(self.config_dict) + + config = Config(self.db_session) + config.load_config(self.config_dict) + + self.db_session.add(config) + self.db_session.commit() + + config_from_db = self.db_session.query(Config).first() + + self.assertTrue(config_from_db == config) \ No newline at end of file diff --git a/src/schedule/tests/test_date_utils.py b/src/schedule/tests/test_date_utils.py index ec70b605..77bf11c4 100644 --- a/src/schedule/tests/test_date_utils.py +++ b/src/schedule/tests/test_date_utils.py @@ -1,5 +1,5 @@ import unittest -from date_utils import * +from schedule.date_utils import * class DateUtilsTest(unittest.TestCase): def test_date_date_with_invalid_date(self): diff --git a/src/schedule/tests/test_job.py b/src/schedule/tests/test_job.py index 66df223b..b8739d5b 100644 --- a/src/schedule/tests/test_job.py +++ b/src/schedule/tests/test_job.py @@ -1,4 +1,143 @@ import unittest +from mock_alchemy.mocking import UnifiedAlchemyMagicMock +from datetime import timedelta, datetime +from schedule.job import Job +from schedule.config import Config +from schedule.function_wrapper import FunctionWrapper +from schedule.constants import (VALID_DATETIME_FORMATS, + CANCELL_TASK_ON_RESTART, + RESCHEDULE_TASK_ON_RESTART, + RUN_TASK_IMMEDIATELLY) class JobTest(unittest.TestCase): - pass + def setUp(self): + self.session = UnifiedAlchemyMagicMock() + + now = datetime.now() + timedelta(minutes=1) + start_date = now.strftime(VALID_DATETIME_FORMATS[0]) + + self.config_dict = { + 'start_date': start_date, + 'repeat_mode': 'daily', + 'timezone': 'America/Sao_Paulo', + } + + self.config = Config(self.session) + self.config.load_config(self.config_dict) + + self.job = Job(self.config, self.session) + + def test_check_if_can_retrieve_job_from_db(self): + funct = FunctionWrapper(lambda s: s, 'test') + self.job.job_funct = funct + + self.session.add(self.job) + 
self.session.commit() + + job_from_db = self.session.query(Job).first() + + self.assertTrue(job_from_db == self.job) + + def test_if_job_should_run_if_in_past(self): + now = self.config.now() + + self.job.next_run = now - timedelta(seconds=1) + + self.assertTrue(self.job.should_run) + + def test_if_job_should_run_if_in_future(self): + now = self.config.now() + self.job.next_run = now + timedelta(seconds=60) + self.assertFalse(self.job.should_run) + + def test_if_job_should_run_if_now(self): + now = self.config.now() + self.job.next_run = now + self.assertTrue(self.job.should_run) + + def test_if_job_exec_funct(self): + self.job.job_funct = FunctionWrapper(lambda s: s, 'test') + + self.assertEqual(self.job.exec_funct(), 'test') + + def test_check_if_is_overdue(self): + now = self.config.now() + self.job.sched_config.max_datetime = now + self.assertTrue(self.job._is_overdue(now + timedelta(seconds=1))) + + def test_first_run_date(self): + start_date = datetime.strptime(self.config_dict['start_date'], VALID_DATETIME_FORMATS[0]) + + self.job._schedule_first_run() + + self.assertEqual(self.job.next_run, start_date) + + def test_next_run_date(self): + start_date = datetime.strptime(self.config_dict['start_date'], VALID_DATETIME_FORMATS[0]) + + self.job._schedule_first_run() + self.job._schedule_next_run() + + self.assertEqual(self.job.next_run, start_date + timedelta(days=1)) + + def test_cancel_job_after_restart(self): + past_date = self.config.now() - timedelta(days=1) + self.job.next_run = past_date + + # When the job is recovered, the next_run is in the past and the behavior_after_system_restart + # is set to CANCELL_TASK_ON_RESTART, the atributte canceled should be set to True + + self.job.sched_config.behavior_after_system_restart = CANCELL_TASK_ON_RESTART + self.job.recover() + + self.assertTrue(self.job.cancelled) + + def test_reschedule_job_after_restart(self): + past_date = self.config.now() - timedelta(days=1) + self.job.next_run = past_date + + # When the job is recovered, the next_run is in the past and the behavior_after_system_restart + # is set to RESCHEDULE_TASK_ON_RESTART, the next_run should be rescheduled to the current date + + self.job.sched_config.behavior_after_system_restart = RESCHEDULE_TASK_ON_RESTART + self.job.recover() + + next_run = past_date + timedelta(days=1) + + self.assertEqual(self.job.next_run, next_run) + + def test_run_job_immediatelly_after_restart(self): + past_date = self.config.now() - timedelta(days=1) + self.job.next_run = past_date + + # When the job is recovered, the next_run is in the past and the behavior_after_system_restart + # is set to RUN_TASK_IMMEDIATELLY, the job should be run immediatelly + + self.job.job_funct = FunctionWrapper(lambda s: s, 'test') + self.job.sched_config.behavior_after_system_restart = RUN_TASK_IMMEDIATELLY + + self.assertIsNone(self.job.last_run) + + self.job.recover() + + self.assertIsNotNone(self.job.last_run) + + def test_job_run(self): + self.job.job_funct = FunctionWrapper(lambda s: s, 'test') + + now = self.config.now() + self.job.next_run = now + + ret = self.job.run() + + self.assertEqual(ret, 'test') + + def test_count_number_of_runs(self): + self.job.job_funct = FunctionWrapper(lambda s: s, 'test') + + now = self.config.now() + self.job.next_run = now + + self.job.run() + + self.assertEqual(self.job.num_repeats, 1) \ No newline at end of file From 44b84ef3f04b84071dac4735e032a463649216ae Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Fri, 28 Apr 2023 09:34:02 -0300 Subject: [PATCH 51/89] Job can self 
stop --- src/schedule/schedule/job.py | 50 ++++++++++++++++++++++++------- src/schedule/schedule/schedule.py | 10 +++---- src/schedule/tests/test_job.py | 2 +- 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/src/schedule/schedule/job.py b/src/schedule/schedule/job.py index 85dd6e52..8605b720 100644 --- a/src/schedule/schedule/job.py +++ b/src/schedule/schedule/job.py @@ -3,7 +3,7 @@ import datetime from typing import Callable, Any -from sqlalchemy import Column, Integer, PickleType, DateTime, ForeignKey, Boolean +from sqlalchemy import Column, Integer, PickleType, DateTime, ForeignKey, String from sqlalchemy.orm import relationship from schedule.constants import (ENV, SQL_ALCHEMY_BASE, CANCELL_TASK_ON_RESTART, @@ -14,9 +14,15 @@ logger = logging.getLogger('scheduler_job') logger.setLevel(logging.DEBUG) +class CancelledJob(object): + """ + Returned by a job when it is cancelled. + """ + pass + class CancelJob(object): """ - Can be returned from a job to unschedule itself. + Can be returned by a job to request its cancellation. """ pass @@ -25,7 +31,8 @@ class Job(SQL_ALCHEMY_BASE): id = Column(Integer, primary_key=True) - cancelled = Column(Boolean, default=False) + cancelled_at = Column(DateTime) + cancelled_reason = Column(String) sched_config_id = Column(Integer, ForeignKey('sched_config.id')) sched_config = relationship('Config', backref='jobs', lazy=True, uselist=False) @@ -55,7 +62,7 @@ def __lt__(self, other: 'Job') -> bool: return self.next_run < other.next_run def __eq__(self, other: 'Job') -> bool: - return self.cancelled == other.cancelled and \ + return self.cancelled_at == other.cancelled_at and \ self.sched_config_id == other.sched_config_id and \ self.num_repeats == other.num_repeats and \ self.last_run == other.last_run and \ @@ -140,7 +147,10 @@ def run(self): if self._is_overdue(self.sched_config.now()): logger.debug(f'Cancelling job {self}.\n\tReason: The job is overdue.') - return CancelJob + + self.cancel(f'The job is overdue.') + + return CancelledJob logger.debug('Running job %s', self) @@ -150,33 +160,51 @@ def run(self): except Exception as e: logger.exception('Error running job %s', self) logger.debug(f'Cancelling job {self}.\n\tReason: Exception raised.') - return CancelJob + + self.cancel(f'Exception raised: {e}') + + return CancelledJob self.num_repeats += 1 if self._achieved_max_repeats(): logger.debug(f'Cancelling job {self}.\n\tReason: Max repeats achieved ({self.cancel_after_max_repeats})') - return CancelJob + + self.cancel(f'Max repeats achieved ({self.cancel_after_max_repeats})') + + return CancelledJob self.last_run = self.sched_config.now() + if isinstance(ret, CancelJob) or ret is CancelJob: + logger.debug(f'Cancelling job {self}.\n\tReason: CancelJob returned.') + + self.cancel(f'CancelJob returned.') + + return CancelledJob + self._schedule_next_run() # The repeat_mode is no_repeat, so we cancel the job if self.next_run is None: logger.debug(f'Cancelling job {self}.\n\tReason: No more runs.') - return CancelJob + return CancelledJob if self._is_overdue(self.next_run): logger.debug(f'Cancelling next job {self} run.\n\tReason: The job is overdue.') - return CancelJob + return CancelledJob return ret - def cancel(self): + def cancel(self, reason: str = None): ''' Cancel the job. 
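+
+        :param reason: Optional human-readable description of why the job was cancelled;
+            it is stored in `cancelled_reason`. Calling cancel() again on an already
+            cancelled job has no effect.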
''' - self.cancelled = True + # The job is already cancelled + if self.cancelled_at is not None: + return + + self.cancelled_at = self.sched_config.now() + self.cancelled_reason = reason def _schedule_first_run(self) -> None: ''' diff --git a/src/schedule/schedule/schedule.py b/src/schedule/schedule/schedule.py index 964da51d..9b11482d 100644 --- a/src/schedule/schedule/schedule.py +++ b/src/schedule/schedule/schedule.py @@ -7,7 +7,7 @@ from schedule.constants import SQL_ALCHEMY_DB_SESSION, SQL_ALCHEMY_ENGINE, SQL_ALCHEMY_BASE from schedule.config import ConfigDict, Config -from schedule.job import Job, CancelJob +from schedule.job import Job, CancelledJob logger = logging.getLogger('scheduler') logger.setLevel(logging.DEBUG) @@ -76,7 +76,7 @@ def _run_job(self, job: "Job") -> None: ret = job.run() job.save() - if isinstance(ret, CancelJob) or ret is CancelJob: + if isinstance(ret, CancelledJob) or ret is CancelledJob: self.cancel_job(job) def schedule_job(self, sched_config_dict: ConfigDict, job_func: Callable, *job_args, **job_kwargs) -> Job: @@ -102,7 +102,7 @@ def schedule_job(self, sched_config_dict: ConfigDict, job_func: Callable, *job_a return new_job - def cancel_job(self, job: Job) -> None: + def cancel_job(self, job: Job, reason: str = None) -> None: ''' Delete a scheduled job. @@ -111,7 +111,7 @@ def cancel_job(self, job: Job) -> None: try: logger.debug('Cancelling job "%s"', job) - job.cancel() + job.cancel(reason) job.save() self.jobs.remove(job) @@ -125,7 +125,7 @@ def _load_jobs_from_db(self) -> None: ''' logger.debug('Recovering jobs') - retrieved_jobs = SQL_ALCHEMY_DB_SESSION.query(Job).filter(Job.cancelled == False).all() + retrieved_jobs = SQL_ALCHEMY_DB_SESSION.query(Job).filter(Job.cancelled_at is None).all() self.jobs = list() diff --git a/src/schedule/tests/test_job.py b/src/schedule/tests/test_job.py index b8739d5b..4a4014d7 100644 --- a/src/schedule/tests/test_job.py +++ b/src/schedule/tests/test_job.py @@ -90,7 +90,7 @@ def test_cancel_job_after_restart(self): self.job.sched_config.behavior_after_system_restart = CANCELL_TASK_ON_RESTART self.job.recover() - self.assertTrue(self.job.cancelled) + self.assertTrue(self.job.cancelled_at is not None) def test_reschedule_job_after_restart(self): past_date = self.config.now() - timedelta(days=1) From ae4459402542678a70d100d86fac05efffa418b8 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Fri, 28 Apr 2023 10:02:35 -0300 Subject: [PATCH 52/89] Test to check if a job can self cancel --- src/schedule/tests/test_job.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/schedule/tests/test_job.py b/src/schedule/tests/test_job.py index 4a4014d7..d758a576 100644 --- a/src/schedule/tests/test_job.py +++ b/src/schedule/tests/test_job.py @@ -1,7 +1,7 @@ import unittest from mock_alchemy.mocking import UnifiedAlchemyMagicMock from datetime import timedelta, datetime -from schedule.job import Job +from schedule.job import Job, CancelJob from schedule.config import Config from schedule.function_wrapper import FunctionWrapper from schedule.constants import (VALID_DATETIME_FORMATS, @@ -140,4 +140,14 @@ def test_count_number_of_runs(self): self.job.run() - self.assertEqual(self.job.num_repeats, 1) \ No newline at end of file + self.assertEqual(self.job.num_repeats, 1) + + def test_job_can_self_cancel(self): + self.job.job_funct = FunctionWrapper(lambda: CancelJob) + + now = self.config.now() + self.job.next_run = now + + self.job.run() + + self.assertTrue(self.job.cancelled_at is not None) \ No 
newline at end of file From dbf0a876fcb03c7dabc25fc2ef68442cb41ed21c Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Fri, 28 Apr 2023 10:02:58 -0300 Subject: [PATCH 53/89] Minor refactoring --- scheduler/src/scheduler.py | 12 +++++++++--- scheduler/src/settings.py | 6 ++++++ src/schedule/schedule/schedule.py | 6 +++--- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index 324af732..9c4c04de 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -2,8 +2,9 @@ from time import sleep import threading import ujson -from schedule import Scheduler as Schedule -from schedule import Config as ScheduleConfig +from schedule.schedule import Schedule +from schedule.config import Config as ScheduleConfig +from schedule.job import CancelJob import requests from kafka import KafkaConsumer @@ -24,7 +25,12 @@ def run_crawler(crawler_id, action): class Scheduler: def __init__(self, jobs): self.jobs = jobs - self.scheduler = Schedule() + self.scheduler = Schedule(connect_db=True, + db_host=settings.DB_HOST, + db_port=settings.DB_PORT, + db_user=settings.DB_USER, + db_pass=settings.DB_PASS, + db_db=settings.DB_DB) def __run_task_consumer(self): # Generates a random name for the consumer diff --git a/scheduler/src/settings.py b/scheduler/src/settings.py index e176e4d2..fb598935 100644 --- a/scheduler/src/settings.py +++ b/scheduler/src/settings.py @@ -15,3 +15,9 @@ TASK_TOPIC = os.getenv('TASK_TOPIC', KAFKA_TOPIC_PREFIX + 'task_topic') TASK_DATA_CONSUMER_GROUP = os.getenv('TASK_DATA_CONSUMER_DATA', KAFKA_TOPIC_PREFIX + '.task_data_group') RUN_CRAWLER_URL = "http://web:8000" + +DB_HOST = os.getenv('DB_HOST', 'db') +DB_PORT = os.getenv('DB_PORT', '5432') +DB_USER = os.getenv('POSTGRES_USER', 'django') +DB_PASS = os.getenv('POSTGRES_PASSWORD', 'c01_password') +DB_DB = os.getenv('POSTGRES_DB', 'c01_prod') \ No newline at end of file diff --git a/src/schedule/schedule/schedule.py b/src/schedule/schedule/schedule.py index 9b11482d..096614c4 100644 --- a/src/schedule/schedule/schedule.py +++ b/src/schedule/schedule/schedule.py @@ -27,9 +27,9 @@ class IntervalError(ScheduleValueError): pass -class Scheduler: +class Schedule: def __init__(self, - persist_task: bool = False, + connect_db: bool = False, db_host: str = 'localhost', db_port: str = 5432, db_user: str = 'sched_user', @@ -41,7 +41,7 @@ def __init__(self, self.db_session = None self.db_engine = None - if persist_task: + if connect_db: for arg in (db_host, db_port, db_user, db_pass, db_db): assert arg is not None, "Must provide all arguments for persisting tasks" From 776cdb8ea065983e95e09ae5f12f373f8920d18c Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 2 May 2023 16:42:10 -0300 Subject: [PATCH 54/89] Fix bug when retrieve objs from db --- interface/settings.py | 2 +- main/models.py | 9 ++++---- main/views.py | 2 +- script.py | 30 +++++++++++++++++++++++++ src/schedule/schedule/config.py | 35 +++++++++++++++++++----------- src/schedule/schedule/constants.py | 6 +++++ src/schedule/schedule/job.py | 28 ++++++++++-------------- src/schedule/schedule/schedule.py | 21 +++++++++--------- 8 files changed, 85 insertions(+), 48 deletions(-) create mode 100644 script.py diff --git a/interface/settings.py b/interface/settings.py index 862e95ab..ef568ed4 100644 --- a/interface/settings.py +++ b/interface/settings.py @@ -44,7 +44,7 @@ SECRET_KEY = get_random_secret_key() # SECURITY WARNING: don't run with debug turned on in production! 
-DEBUG = env('DEBUG') +DEBUG = True#env('DEBUG') ALLOWED_HOSTS = env('DJANGO_ALLOWED_HOSTS') diff --git a/main/models.py b/main/models.py index fe3f0423..b3fea427 100644 --- a/main/models.py +++ b/main/models.py @@ -1,5 +1,4 @@ import datetime -from typing import List, Union from crawler_manager.constants import * from crawling_utils.constants import (AUTO_ENCODE_DETECTION, @@ -8,8 +7,8 @@ from django.db import models from django.db.models.base import ModelBase from django.utils import timezone -from typing_extensions import Literal, TypedDict -from schedule import SchedulerConfigDict, SchedulerConfig +from typing_extensions import Literal +from schedule.config import ConfigDict, Config CRAWLER_QUEUE_DB_ID = 1 @@ -650,7 +649,7 @@ class CrawlerQueueItem(TimeStamped): running = models.BooleanField(default=False, blank=True) position = models.IntegerField(null=False, default=0) -class TaskType(SchedulerConfigDict): +class TaskType(ConfigDict): id: int crawl_request: int crawler_queue_behavior: Literal['wait_on_last_queue_position', 'wait_on_first_queue_position', 'run_immediately'] @@ -675,7 +674,7 @@ def __str__(self): @property def next_run(self): - sched_config = SchedulerConfig() + sched_config = Config() sched_config.load_config(self.scheduler_config) if self.last_run is None: diff --git a/main/views.py b/main/views.py index 06476afa..1877ecb1 100644 --- a/main/views.py +++ b/main/views.py @@ -34,7 +34,7 @@ CrawlRequestSerializer, TaskSerializer) from .task_filter import task_filter_by_date_interval -from schedule.scheduler_config import SchedulerConfig +from schedule.config import Config as SchedulerConfig # Log the information to the file logger logger = logging.getLogger('file') diff --git a/script.py b/script.py new file mode 100644 index 00000000..800382fb --- /dev/null +++ b/script.py @@ -0,0 +1,30 @@ +import time +from datetime import datetime +from schedule.schedule import Schedule + +def funct(*args, **kwargs): + now = datetime.now() + + print(f'Now is {now}') + print(f'Args: {args}') + print(f'Kwargs: {kwargs}') + +if __name__ == '__main__': + sched_config = { + 'start_date': '02-05-2023T15:33', + 'timezone': 'America/Sao_Paulo', + 'repeat_mode': 'minutely' + } + + schedule = Schedule(connect_db=True, + db_host='localhost', + db_port=5432, + db_user='django', + db_pass='c01_password', + db_db='c01_prod',) + + # schedule.schedule_job(sched_config, funct, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, b=4, c=5, d=6, e=7, f=8, g=9, h=10) + + while True: + schedule.run_pending() + time.sleep(1) \ No newline at end of file diff --git a/src/schedule/schedule/config.py b/src/schedule/schedule/config.py index da304ff2..bfb9c4ea 100644 --- a/src/schedule/schedule/config.py +++ b/src/schedule/schedule/config.py @@ -44,7 +44,7 @@ class Config(SQL_ALCHEMY_BASE): timezone: str = Column(String, nullable=True) # 0: cancel task, 1: re-schedule task for next valid run, 2: execute task now - behavior_after_system_restart: int = Column(Integer, default=0) + behavior_after_system_restart: int = Column(Integer, default=DEFAULT_BEHAVIOR_AFTER_SYSTEM_RESTART) repeat_mode: str = Column(String, default=NO_REPEAT_MODE) repeat_interval: int = Column(Integer, default=1) @@ -67,10 +67,8 @@ class Config(SQL_ALCHEMY_BASE): monthly_first_weekday: Optional[int] = Column(Integer, default=None) monthly_last_weekday: Optional[int] = Column(Integer, default=None) - def __init__(self, db_session=None): + def __init__(self): super().__init__() - - self.db_session = db_session self.repeat_interval = 1 def __eq__(self, other): @@ 
-87,15 +85,12 @@ def __eq__(self, other): self.monthly_first_weekday == other.monthly_first_weekday and \ self.monthly_last_weekday == other.monthly_last_weekday - def save(self): + def save(self, db_session): ''' Saves the config to the database. ''' - if self.db_session is None: - return - - self.db_session.add(self) - self.db_session.commit() + db_session.add(self) + db_session.commit() def first_run_date(self) -> datetime.datetime: ''' @@ -116,12 +111,26 @@ def first_run_date(self) -> datetime.datetime: elif self.repeat_mode == MINUTELY_REPEAT_MODE: # Must consider the hour of start date - raise NotImplementedError() + if now < start_date: + return start_date + + # TODO: Make this more efficient + while start_date < self.now(): + start_date += datetime.timedelta(minutes=repeat_interval) + return start_date + if self.repeat_mode == HOURLY_REPEAT_MODE: # Must consider the hour of start date - raise NotImplementedError() - + if now < start_date: + return start_date + + # TODO: Make this more efficient + while start_date < self.now(): + start_date += datetime.timedelta(hours=repeat_interval) + + return start_date + elif self.repeat_mode == DAILY_REPEAT_MODE: return start_date if now < start_date else start_date + datetime.timedelta(days=repeat_interval) diff --git a/src/schedule/schedule/constants.py b/src/schedule/schedule/constants.py index 37fa0381..e93ce452 100644 --- a/src/schedule/schedule/constants.py +++ b/src/schedule/schedule/constants.py @@ -1,4 +1,5 @@ import environ +from enum import Enum from sqlalchemy.orm import declarative_base SQL_ALCHEMY_BASE = declarative_base() @@ -6,6 +7,7 @@ POSTGRES_SCHED_CONFIG_TABLE_NAME=(str, 'sched_config'), POSTGRES_SCHED_JOB_TABLE_NAME=(str, 'sched_job')) + # SCHEDULE CONFIG NUM_DAYS_IN_WEEK = 7 @@ -45,6 +47,10 @@ '%d-%m-%Y %H:%M:%S', '%Y-%m-%d %H:%M', '%d-%m-%Y %H:%M', + '%Y-%m-%dT%H:%M:%S', + '%d-%m-%YT%H:%M:%S', + '%Y-%m-%dT%H:%M', + '%d-%m-%YT%H:%M', '%Y-%m-%d', '%d-%m-%Y', ) diff --git a/src/schedule/schedule/job.py b/src/schedule/schedule/job.py index 8605b720..7e851eab 100644 --- a/src/schedule/schedule/job.py +++ b/src/schedule/schedule/job.py @@ -12,7 +12,12 @@ from schedule.function_wrapper import FunctionWrapper logger = logging.getLogger('scheduler_job') -logger.setLevel(logging.DEBUG) + +# saving log in file +file_handler = logging.FileHandler('scheduler_job.log') +file_handler.setLevel(logging.DEBUG) + +# logger.setLevel(logging.DEBUG) class CancelledJob(object): """ @@ -44,7 +49,7 @@ class Job(SQL_ALCHEMY_BASE): job_funct = Column(PickleType, default=None, nullable=False) - def __init__(self, sched_config: Config, db_session = None) -> None: + def __init__(self, sched_config: Config) -> None: ''' Create a new job. @@ -54,8 +59,6 @@ def __init__(self, sched_config: Config, db_session = None) -> None: self.sched_config: Config = sched_config self.num_repeats: int = 0 - self.db_session = db_session - def __lt__(self, other: 'Job') -> bool: assert self.next_run is not None, "must run _schedule_next_run before" assert other.next_run is not None, "must run _schedule_next_run before" @@ -86,20 +89,15 @@ def do(self, job_func: Callable, *args, **kwargs): ''' self.job_funct = FunctionWrapper(job_func, *args, **kwargs) self._schedule_first_run() - self.save() - def save(self): + def save(self, db_session): ''' Save the job to the database. 
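+
+        :param db_session: SQLAlchemy session used to persist the job and its schedule config.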
''' + self.sched_config.save(db_session) - self.sched_config.save() - - if self.db_session is None: - return - - self.db_session.add(self) - self.db_session.commit() + db_session.add(self) + db_session.commit() def recover(self): ''' @@ -152,8 +150,6 @@ def run(self): return CancelledJob - logger.debug('Running job %s', self) - try: ret = self.exec_funct() @@ -217,11 +213,9 @@ def _schedule_next_run(self) -> None: Schedule the next run of the job. ''' self.next_run = self.sched_config.next_run_date(self.next_run) - # If the next run is overdue, we schedule the next run. # This can happen if the system is down for a long time while self.next_run is not None and self._is_overdue(self.next_run): - logger.debug(f'Job {self} is overdue. Scheduling next run.') self.next_run = self.sched_config.next_run_date(self.next_run) def _is_overdue(self, when: datetime.datetime) -> bool: diff --git a/src/schedule/schedule/schedule.py b/src/schedule/schedule/schedule.py index 096614c4..1987e618 100644 --- a/src/schedule/schedule/schedule.py +++ b/src/schedule/schedule/schedule.py @@ -4,8 +4,8 @@ from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker - -from schedule.constants import SQL_ALCHEMY_DB_SESSION, SQL_ALCHEMY_ENGINE, SQL_ALCHEMY_BASE +from sqlalchemy import null +from schedule.constants import SQL_ALCHEMY_BASE from schedule.config import ConfigDict, Config from schedule.job import Job, CancelledJob @@ -74,7 +74,7 @@ def run_pending(self) -> None: def _run_job(self, job: "Job") -> None: ret = job.run() - job.save() + job.save(self.db_session) if isinstance(ret, CancelledJob) or ret is CancelledJob: self.cancel_job(job) @@ -92,12 +92,13 @@ def schedule_job(self, sched_config_dict: ConfigDict, job_func: Callable, *job_a ''' logger.debug('Scheduling job "%s" %s %s', job_func.__name__, job_args, job_kwargs) - sched_config = Config(self.db_session) + sched_config = Config() sched_config.load_config(sched_config_dict) - new_job = Job(sched_config, self.db_session) + new_job = Job(sched_config) new_job.do(job_func, *job_args, **job_kwargs) - + new_job.save(self.db_session) + self.jobs.append(new_job) return new_job @@ -112,7 +113,7 @@ def cancel_job(self, job: Job, reason: str = None) -> None: logger.debug('Cancelling job "%s"', job) job.cancel(reason) - job.save() + job.save(self.db_session) self.jobs.remove(job) @@ -125,14 +126,12 @@ def _load_jobs_from_db(self) -> None: ''' logger.debug('Recovering jobs') - retrieved_jobs = SQL_ALCHEMY_DB_SESSION.query(Job).filter(Job.cancelled_at is None).all() + retrieved_jobs = self.db_session.query(Job).filter(Job.cancelled_at == null()).all() self.jobs = list() - for job in sorted(retrieved_jobs): job.recover() - - if not job.cancelled: + if not job.cancelled_at: self.jobs.append(job) def cancel_all_jobs(self) -> None: From c0c1fd255a970b7223a441ad7099f0f6fcdce5b8 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Wed, 3 May 2023 17:01:39 -0300 Subject: [PATCH 55/89] Fix bugs when the sched. 
job is recovered --- scheduler/src/scheduler.py | 14 ++++++---- src/schedule/schedule/job.py | 52 +++++++++++++++++++++--------------- 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index 9c4c04de..da3ca93e 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -23,8 +23,8 @@ def run_crawler(crawler_id, action): print(f'[{datetime.now()}] [TC] Crawler {crawler_id} processed by schedule...') class Scheduler: - def __init__(self, jobs): - self.jobs = jobs + def __init__(self): + self.jobs = dict() self.scheduler = Schedule(connect_db=True, db_host=settings.DB_HOST, db_port=settings.DB_PORT, @@ -63,8 +63,7 @@ def __run_task_consumer(self): # print(f'[{datetime.now()}] [TC] {worker_name} Worker: Error processing task data: "{e}"') def _set_schedule_call_for_task(self, config_dict, task_id, crawler_id, behavior): - config = ScheduleConfig(config_dict) - job = self.scheduler.schedule_job(config, run_crawler, crawler_id=crawler_id, action=behavior) + job = self.scheduler.schedule_job(config_dict, run_crawler, crawler_id=crawler_id, action=behavior) self.jobs[task_id] = job def __process_task_data(self, data): @@ -93,4 +92,9 @@ def run(self): self.__create_task_consumer() while True: self.scheduler.run_pending() - sleep(1) \ No newline at end of file + sleep(1) + + +if __name__ == "__main__": + scheduler = Scheduler() + scheduler.run() \ No newline at end of file diff --git a/src/schedule/schedule/job.py b/src/schedule/schedule/job.py index 7e851eab..824f8469 100644 --- a/src/schedule/schedule/job.py +++ b/src/schedule/schedule/job.py @@ -12,12 +12,7 @@ from schedule.function_wrapper import FunctionWrapper logger = logging.getLogger('scheduler_job') - -# saving log in file -file_handler = logging.FileHandler('scheduler_job.log') -file_handler.setLevel(logging.DEBUG) - -# logger.setLevel(logging.DEBUG) +logger.setLevel(logging.DEBUG) class CancelledJob(object): """ @@ -99,7 +94,7 @@ def save(self, db_session): db_session.add(self) db_session.commit() - def recover(self): + def recover(self) -> Any: ''' Ensure that the job is scheduled to run again after a system restart. 
''' @@ -110,10 +105,20 @@ def recover(self): self.cancel() elif self.sched_config.behavior_after_system_restart == RESCHEDULE_TASK_ON_RESTART: - self._schedule_next_run() + self._schedule_next_run(True) elif self.sched_config.behavior_after_system_restart == RUN_TASK_IMMEDIATELLY: - self.run() + try: + ret = self.exec_funct() + + except Exception as e: + logger.exception('Error running job %s in recovery mode.', self) + logger.debug(f'Cancelling job {self}.\n\tReason: Exception raised.') + self.cancel(f'Exception raised: {e}') + return CancelledJob + + self._schedule_next_run(True) + return ret else: raise ValueError(f'Invalid behavior_after_system_restart: {self.sched_config.behavior_after_system_restart}') @@ -145,9 +150,7 @@ def run(self): if self._is_overdue(self.sched_config.now()): logger.debug(f'Cancelling job {self}.\n\tReason: The job is overdue.') - self.cancel(f'The job is overdue.') - return CancelledJob try: @@ -156,26 +159,20 @@ def run(self): except Exception as e: logger.exception('Error running job %s', self) logger.debug(f'Cancelling job {self}.\n\tReason: Exception raised.') - self.cancel(f'Exception raised: {e}') - return CancelledJob self.num_repeats += 1 if self._achieved_max_repeats(): logger.debug(f'Cancelling job {self}.\n\tReason: Max repeats achieved ({self.cancel_after_max_repeats})') - self.cancel(f'Max repeats achieved ({self.cancel_after_max_repeats})') - return CancelledJob self.last_run = self.sched_config.now() if isinstance(ret, CancelJob) or ret is CancelJob: logger.debug(f'Cancelling job {self}.\n\tReason: CancelJob returned.') - self.cancel(f'CancelJob returned.') - return CancelledJob self._schedule_next_run() @@ -208,15 +205,26 @@ def _schedule_first_run(self) -> None: ''' self.next_run = self.sched_config.first_run_date() - def _schedule_next_run(self) -> None: + def _schedule_next_run(self, recovery_mode: bool = False) -> None: ''' Schedule the next run of the job. ''' self.next_run = self.sched_config.next_run_date(self.next_run) - # If the next run is overdue, we schedule the next run. - # This can happen if the system is down for a long time - while self.next_run is not None and self._is_overdue(self.next_run): - self.next_run = self.sched_config.next_run_date(self.next_run) + + if recovery_mode: + # If the next run is overdue, we schedule the next run. + # This can happen if the system is down for a long time + while True: + if self.next_run is None: + break + + if self._is_overdue(self.next_run): + break + + if self.sched_config.now() < self.next_run: + break + + self.next_run = self.sched_config.next_run_date(self.next_run) def _is_overdue(self, when: datetime.datetime) -> bool: ''' From bc5ce94cf86fd241f2fad30c187bc62ba3ff9479 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Thu, 4 May 2023 15:31:22 -0300 Subject: [PATCH 56/89] Integration of the new sched. lib with sched. 
container --- main/models.py | 16 ++------- main/serializers.py | 5 ++- main/views.py | 10 ++++++ scheduler/src/scheduler.py | 10 +++--- script.py | 43 +++++++++++++---------- src/schedule/schedule/function_wrapper.py | 16 +++++++-- src/schedule/schedule/job.py | 31 ++++++++++------ 7 files changed, 80 insertions(+), 51 deletions(-) diff --git a/main/models.py b/main/models.py index b3fea427..33c7d67f 100644 --- a/main/models.py +++ b/main/models.py @@ -662,23 +662,13 @@ class Task(TimeStamped): ] crawl_request = models.ForeignKey(CrawlRequest, on_delete=models.CASCADE, related_name='scheduler_jobs') + next_run = models.DateTimeField(null=True, blank=True) last_run = models.DateTimeField(null=True, blank=True) - + crawler_queue_behavior = models.CharField( max_length=32, choices=CRAWLER_QUEUE_BEHAVIOR_CHOICES, default='wait_on_last_queue_position') scheduler_config = models.JSONField() def __str__(self): - return f'{self.crawl_request} - {self.start_date}' - - @property - def next_run(self): - sched_config = Config() - sched_config.load_config(self.scheduler_config) - - if self.last_run is None: - return sched_config.first_run_date() - - return sched_config.next_run_date(self.last_run) - + return f'{self.crawl_request} - {self.start_date}' \ No newline at end of file diff --git a/main/serializers.py b/main/serializers.py index 8f229670..060e1e38 100644 --- a/main/serializers.py +++ b/main/serializers.py @@ -51,12 +51,11 @@ class Meta: class TaskSerializer(serializers.ModelSerializer): crawler_name = serializers.ReadOnlyField(source='crawl_request.source_name') - next_run = serializers.DateTimeField(read_only=True) class Meta: model = Task - read_only_fields = ['id', 'creation_date', 'last_modified'] + read_only_fields = ['id', 'creation_date', 'last_modified', 'next_run', 'last_run'] fields = ['id', 'creation_date', 'last_modified', 'crawl_request', - 'crawler_name', 'crawler_queue_behavior', 'last_run', + 'crawler_name', 'crawler_queue_behavior','last_run', 'scheduler_config', 'next_run'] diff --git a/main/views.py b/main/views.py index 1877ecb1..539c2d44 100644 --- a/main/views.py +++ b/main/views.py @@ -970,6 +970,16 @@ class CrawlerViewSet(viewsets.ModelViewSet): def run(self, request, pk): query_params = self.request.query_params.dict() action = query_params.get('action', '') + next_run = query_params.get('next_run') + + if next_run: + next_run = datetime.strptime(next_run, '%Y-%m-%d %H:%M:%S') + # next_run = pytz.timezone(settings.TIME_ZONE).localize(next_run) + + task = Task.objects.get(crawl_request__pk=pk) + task.next_run = next_run + task.last_run = datetime.now() + task.save() if action == 'run_immediately': wait_on = 'no_wait' diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index da3ca93e..ac2be9e8 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -3,8 +3,6 @@ import threading import ujson from schedule.schedule import Schedule -from schedule.config import Config as ScheduleConfig -from schedule.job import CancelJob import requests from kafka import KafkaConsumer @@ -18,9 +16,11 @@ UPDATE_TASK = "update" CREATE_TASK = "create" -def run_crawler(crawler_id, action): - SERVER_SESSION.get(settings.RUN_CRAWLER_URL + "/api/crawlers/{}/run?action={}".format(crawler_id, action)) - print(f'[{datetime.now()}] [TC] Crawler {crawler_id} processed by schedule...') +def run_crawler(crawler_id, action, next_run): + SERVER_SESSION.get(settings.RUN_CRAWLER_URL + \ + "/api/crawlers/{}/run?action={}&next_run={}".format(crawler_id, action, next_run)) 
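# Editor's aside (illustrative, not part of the patch): with the default
# RUN_CRAWLER_URL = "http://web:8000" defined later in this series, the GET
# request built above ends up looking roughly like the line below; the
# crawler id and timestamp are made-up values for illustration only.
#
#   http://web:8000/api/crawlers/7/run?action=wait_on_last_queue_position&next_run=2023-05-03 15:32:00
#
# The Django view shown earlier in this patch parses the next_run query
# parameter with datetime.strptime(next_run, '%Y-%m-%d %H:%M:%S'), so the
# timestamp must be sent in exactly that format.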
+ + print(f'[{datetime.now()}] [TC] Crawler {crawler_id} processed by schedule. \n\tAction: {action} \n\tNext run: {next_run}') class Scheduler: def __init__(self): diff --git a/script.py b/script.py index 800382fb..39d150e1 100644 --- a/script.py +++ b/script.py @@ -1,30 +1,37 @@ import time from datetime import datetime from schedule.schedule import Schedule +import inspect -def funct(*args, **kwargs): +def funct(next_run: datetime = None): now = datetime.now() print(f'Now is {now}') - print(f'Args: {args}') - print(f'Kwargs: {kwargs}') + + sleep_secs = 130 + for i in range(sleep_secs): + print(f'Sleeping for {sleep_secs - i} seconds...') + time.sleep(1) if __name__ == '__main__': - sched_config = { - 'start_date': '02-05-2023T15:33', - 'timezone': 'America/Sao_Paulo', - 'repeat_mode': 'minutely' - } + args = inspect.getfullargspec(funct).args + print(args) + # sched_config = { + # 'start_date': '03-05-2023T15:32', + # 'timezone': 'America/Sao_Paulo', + # 'repeat_mode': 'minutely' + # } - schedule = Schedule(connect_db=True, - db_host='localhost', - db_port=5432, - db_user='django', - db_pass='c01_password', - db_db='c01_prod',) + # schedule = Schedule(connect_db=True, + # db_host='localhost', + # db_port=5432, + # db_user='django', + # db_pass='c01_password', + # db_db='c01_prod',) - # schedule.schedule_job(sched_config, funct, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, b=4, c=5, d=6, e=7, f=8, g=9, h=10) + # schedule.schedule_job(sched_config, funct) - while True: - schedule.run_pending() - time.sleep(1) \ No newline at end of file + # while True: + # print('Checking for jobs...') + # schedule.run_pending() + # time.sleep(1) \ No newline at end of file diff --git a/src/schedule/schedule/function_wrapper.py b/src/schedule/schedule/function_wrapper.py index 6bc9f92c..f4582804 100644 --- a/src/schedule/schedule/function_wrapper.py +++ b/src/schedule/schedule/function_wrapper.py @@ -1,11 +1,18 @@ +from datetime import datetime from typing import Any, Callable, Dict, List, Any +import inspect + class FunctionWrapper: def __init__(self, funct: Callable, *args, **kwargs): self.funct: Callable = funct self.args: List[Any] = list(args) self.kwargs: Dict[str, Any] = kwargs - def __call__(self) -> Any: + def __call__(self, next_run: datetime = None) -> Any: + # check if the funct accepts a next_run argument + if self.funct_requires_next_run() and next_run is not None: + self.kwargs["next_run"] = next_run + return self.funct(*self.args, **self.kwargs) def __repr__(self) -> str: @@ -18,4 +25,9 @@ def __eq__(self, other: "FunctionWrapper") -> bool: return self.funct == other.funct and self.args == other.args and self.kwargs == other.kwargs def __hash__(self) -> int: - return hash((self.funct, tuple(self.args), frozenset(self.kwargs.items()))) \ No newline at end of file + return hash((self.funct, tuple(self.args), frozenset(self.kwargs.items()))) + + def funct_requires_next_run(self) -> bool: + args_accept = inspect.getfullargspec(self.funct).args + return 'next_run' in args_accept + \ No newline at end of file diff --git a/src/schedule/schedule/job.py b/src/schedule/schedule/job.py index 824f8469..f1664fed 100644 --- a/src/schedule/schedule/job.py +++ b/src/schedule/schedule/job.py @@ -1,6 +1,7 @@ import logging import datetime +import inspect from typing import Callable, Any from sqlalchemy import Column, Integer, PickleType, DateTime, ForeignKey, String @@ -141,7 +142,11 @@ def exec_funct(self) -> Any: if self.job_funct is None: raise ValueError('job_func is None') - return self.job_funct() + next_run = None 
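# --- Editor's aside (illustrative sketch, not part of the patch) ---
# The inspect-based check that FunctionWrapper.funct_requires_next_run()
# performs (see function_wrapper.py above) can be exercised on its own.
# `my_callback` is a hypothetical function written only for this example.
import inspect

def my_callback(crawler_id, action, next_run=None):
    """Hypothetical scheduled callback that accepts the next run time."""
    return crawler_id, action, next_run

# 'next_run' appears among the callback's argument names, so the wrapper
# knows it may inject the next scheduled run time before calling it.
assert 'next_run' in inspect.getfullargspec(my_callback).args
# --- end of editor's aside ---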
+ if self.job_funct.funct_requires_next_run(): + next_run = self.get_next_run() + + return self.job_funct(next_run) def run(self): ''' @@ -205,26 +210,32 @@ def _schedule_first_run(self) -> None: ''' self.next_run = self.sched_config.first_run_date() - def _schedule_next_run(self, recovery_mode: bool = False) -> None: + def get_next_run(self, recovery_mode: bool = False) -> datetime.datetime: ''' - Schedule the next run of the job. - ''' - self.next_run = self.sched_config.next_run_date(self.next_run) + Get the next run of the job. + ''' + next_run = self.sched_config.next_run_date(self.next_run) if recovery_mode: - # If the next run is overdue, we schedule the next run. - # This can happen if the system is down for a long time while True: if self.next_run is None: break - if self._is_overdue(self.next_run): + if self._is_overdue(next_run): break - if self.sched_config.now() < self.next_run: + if self.sched_config.now() < next_run: break - self.next_run = self.sched_config.next_run_date(self.next_run) + next_run = self.sched_config.next_run_date(next_run) + + return next_run + + def _schedule_next_run(self, recovery_mode: bool = False) -> None: + ''' + Schedule the next run of the job. + ''' + self.next_run = self.get_next_run(recovery_mode) def _is_overdue(self, when: datetime.datetime) -> bool: ''' From 8f8737be9ee8c6997d6640698fd6d67951d24522 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 9 May 2023 16:11:33 -0300 Subject: [PATCH 57/89] Fix error passing wrong data to contAINER --- main/views.py | 6 +++--- scheduler/src/scheduler.py | 20 ++++++++++++++------ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/main/views.py b/main/views.py index 539c2d44..247d1aea 100644 --- a/main/views.py +++ b/main/views.py @@ -1232,7 +1232,7 @@ def partial_update(self, request, pk=None): if response.status_code == status.HTTP_200_OK: data = response.data - schedule_data = { + schedule_config = { 'start_date': data.get('start_date'), 'timezone': data.get('timezone'), 'repeat_mode': data.get('repeat_mode'), @@ -1240,14 +1240,14 @@ def partial_update(self, request, pk=None): } task_data = { - 'id': data.get('id'), + 'id': int(data.get('id')), 'crawl_request': data.get('crawl_request'), 'crawler_queue_behavior': data.get('crawler_queue_behavior'), } message = { 'action': 'update', - 'schedule_data': schedule_data, + 'schedule_config': schedule_config, 'task_data': task_data } diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index ac2be9e8..a3d3673d 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -68,22 +68,30 @@ def _set_schedule_call_for_task(self, config_dict, task_id, crawler_id, behavior def __process_task_data(self, data): action = data['action'] - config_dict = data['schedule_config'] - - task_id = data['task_data']['id'] - crawler_id = data['task_data']['crawl_request'] - behavior = data['task_data']['crawler_queue_behavior'] + print(f'[{datetime.now()}] [TC] Jobs at start: {self.jobs}') if action == CANCEL_TASK: + task_id = int(data['id']) self.scheduler.cancel_job(self.jobs[task_id]) + return + + config_dict = data['schedule_config'] + task_id = int(data['task_data']['id']) + crawler_id = data['task_data']['crawl_request'] + behavior = data['task_data']['crawler_queue_behavior'] if action == UPDATE_TASK: self.scheduler.cancel_job(self.jobs[task_id]) self._set_schedule_call_for_task(config_dict, task_id, crawler_id, behavior) - if action == CREATE_TASK: + elif action == CREATE_TASK: self._set_schedule_call_for_task(config_dict, 
task_id, crawler_id, behavior) + else: + print(f'[{datetime.now()}] [TC] Unknown action: {action}') + + print(f'\t[{datetime.now()}] [TC] Jobs at end: {self.jobs}') + def __create_task_consumer(self): self.thread = threading.Thread(target=self.__run_task_consumer, daemon=True) self.thread.start() From 1000c4b07cf15d2fce33fa5872004433dbca5b25 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Thu, 11 May 2023 13:59:54 -0300 Subject: [PATCH 58/89] Improve the restart method of scheduled jobs --- main/views.py | 11 ++--- scheduler/src/scheduler.py | 60 +++++++++++++++++++++-- scheduler/src/settings.py | 3 ++ src/schedule/schedule/function_wrapper.py | 8 ++- src/schedule/schedule/job.py | 13 ++++- 5 files changed, 82 insertions(+), 13 deletions(-) diff --git a/main/views.py b/main/views.py index 247d1aea..0d7f8ae5 100644 --- a/main/views.py +++ b/main/views.py @@ -970,14 +970,13 @@ class CrawlerViewSet(viewsets.ModelViewSet): def run(self, request, pk): query_params = self.request.query_params.dict() action = query_params.get('action', '') - next_run = query_params.get('next_run') - if next_run: - next_run = datetime.strptime(next_run, '%Y-%m-%d %H:%M:%S') - # next_run = pytz.timezone(settings.TIME_ZONE).localize(next_run) + # check if there is a task for this crawler + task = Task.objects.filter(crawl_request__pk=pk).first() - task = Task.objects.get(crawl_request__pk=pk) - task.next_run = next_run + if task: + next_run = query_params.get('next_run') + task.next_run = datetime.strptime(next_run, '%Y-%m-%d %H:%M:%S') if next_run else None task.last_run = datetime.now() task.save() diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index a3d3673d..87edc60e 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -4,10 +4,10 @@ import ujson from schedule.schedule import Schedule import requests - +from requests.exceptions import ConnectionError from kafka import KafkaConsumer from coolname import generate_slug - + import settings SERVER_SESSION = requests.sessions.Session() @@ -16,15 +16,43 @@ UPDATE_TASK = "update" CREATE_TASK = "create" +MAX_ATTEMPTS = 3 +SLEEP_TIME = 5 + def run_crawler(crawler_id, action, next_run): - SERVER_SESSION.get(settings.RUN_CRAWLER_URL + \ - "/api/crawlers/{}/run?action={}&next_run={}".format(crawler_id, action, next_run)) + attempt = 0 + sleep_time = SLEEP_TIME + + url = settings.RUN_CRAWLER_URL + "/api/crawlers/{}/run?action={}".format(crawler_id, action) + if next_run: + url += "&next_run={}".format(next_run) + + while attempt < MAX_ATTEMPTS: + try: + resp = SERVER_SESSION.get(url) + + if resp.status_code != 200: + raise ConnectionError(f'[{datetime.now()}] [TC] Error running crawler {crawler_id}. \n\tServer response: {resp.text}') + + print(f'[{datetime.now()}] [TC] Crawler {crawler_id} processed by schedule. \n\tAction: {action} \n\tNext run: {next_run}\n\t Server response: {resp}') + break + + except Exception as e: + attempt += 1 + sleep_time *= attempt + + print(f'[{datetime.now()}] [TC] Error running crawler {crawler_id}.\n\tAttempt: {attempt}\n\tSleep time: {sleep_time}\n\tReason: {e}') + sleep(sleep_time) + + continue - print(f'[{datetime.now()}] [TC] Crawler {crawler_id} processed by schedule. \n\tAction: {action} \n\tNext run: {next_run}') + if attempt == MAX_ATTEMPTS: + print(f'[{datetime.now()}] [TC] Error running crawler {crawler_id}. 
\n\tMax attempts reached.') class Scheduler: def __init__(self): self.jobs = dict() + self._lock_until_server_up() self.scheduler = Schedule(connect_db=True, db_host=settings.DB_HOST, db_port=settings.DB_PORT, @@ -32,6 +60,28 @@ def __init__(self): db_pass=settings.DB_PASS, db_db=settings.DB_DB) + def _lock_until_server_up(self): + ''' + Lock the scheduler until the server is up. + ''' + print(f'[{datetime.now()}] [TC] Waiting for server to be up...') + + time_waited = 0 + while time_waited < settings.MAX_WAIT_TIME: + try: + SERVER_SESSION.get(settings.RUN_CRAWLER_URL) + break + + except: + time_waited += settings.WAIT_TIME + sleep(settings.WAIT_TIME) + continue + + if time_waited == settings.MAX_WAIT_TIME: + raise ConnectionError(f'[{datetime.now()}] [TC] Server is down. \n\tMax wait time reached.') + + print(f'[{datetime.now()}] [TC] Server is up.') + def __run_task_consumer(self): # Generates a random name for the consumer worker_name = generate_slug(2).capitalize() diff --git a/scheduler/src/settings.py b/scheduler/src/settings.py index fb598935..114b0be2 100644 --- a/scheduler/src/settings.py +++ b/scheduler/src/settings.py @@ -16,6 +16,9 @@ TASK_DATA_CONSUMER_GROUP = os.getenv('TASK_DATA_CONSUMER_DATA', KAFKA_TOPIC_PREFIX + '.task_data_group') RUN_CRAWLER_URL = "http://web:8000" +WAIT_TIME = 1 +MAX_WAIT_TIME = 60 + DB_HOST = os.getenv('DB_HOST', 'db') DB_PORT = os.getenv('DB_PORT', '5432') DB_USER = os.getenv('POSTGRES_USER', 'django') diff --git a/src/schedule/schedule/function_wrapper.py b/src/schedule/schedule/function_wrapper.py index f4582804..61b1cb89 100644 --- a/src/schedule/schedule/function_wrapper.py +++ b/src/schedule/schedule/function_wrapper.py @@ -10,7 +10,13 @@ def __init__(self, funct: Callable, *args, **kwargs): def __call__(self, next_run: datetime = None) -> Any: # check if the funct accepts a next_run argument - if self.funct_requires_next_run() and next_run is not None: + + print('-' * 15) + print(f'The funct {self.funct} requires next_run: {self.funct_requires_next_run()}') + print(f'next_run: {next_run}') + print('-' * 15) + + if self.funct_requires_next_run(): self.kwargs["next_run"] = next_run return self.funct(*self.args, **self.kwargs) diff --git a/src/schedule/schedule/job.py b/src/schedule/schedule/job.py index f1664fed..5e1724ea 100644 --- a/src/schedule/schedule/job.py +++ b/src/schedule/schedule/job.py @@ -8,7 +8,8 @@ from sqlalchemy.orm import relationship from schedule.constants import (ENV, SQL_ALCHEMY_BASE, CANCELL_TASK_ON_RESTART, - RESCHEDULE_TASK_ON_RESTART, RUN_TASK_IMMEDIATELLY) + RESCHEDULE_TASK_ON_RESTART, RUN_TASK_IMMEDIATELLY, + NO_REPEAT_MODE) from schedule.config import Config from schedule.function_wrapper import FunctionWrapper @@ -102,10 +103,16 @@ def recover(self) -> Any: # Pending task during idle time if self.next_run < self.sched_config.now(): + if self.sched_config.behavior_after_system_restart == CANCELL_TASK_ON_RESTART: self.cancel() elif self.sched_config.behavior_after_system_restart == RESCHEDULE_TASK_ON_RESTART: + if self.sched_config.repeat_mode == NO_REPEAT_MODE: + print(f'Cancelling job {self}.\n\tReason: Job is overdue and has no repeat mode.') + self.cancel(f'Job {self} is overdue and has no repeat mode.') + return CancelledJob + self._schedule_next_run(True) elif self.sched_config.behavior_after_system_restart == RUN_TASK_IMMEDIATELLY: @@ -118,6 +125,9 @@ def recover(self) -> Any: self.cancel(f'Exception raised: {e}') return CancelledJob + if self.sched_config.repeat_mode == NO_REPEAT_MODE: + return CancelledJob + 
self._schedule_next_run(True) return ret @@ -145,6 +155,7 @@ def exec_funct(self) -> Any: next_run = None if self.job_funct.funct_requires_next_run(): next_run = self.get_next_run() + print(f'The job function {self.job_funct} requires the next run time: {next_run}') return self.job_funct(next_run) From 21654cb55e560222971d6d3f58314501b4ba1b3b Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Wed, 17 May 2023 15:59:35 -0300 Subject: [PATCH 59/89] Fix bugs when delete/remove schedulings --- main/views.py | 9 ++------- scheduler/src/scheduler.py | 15 +++++++++++++-- src/schedule/schedule/config.py | 4 ++++ src/schedule/schedule/job.py | 8 ++++++++ src/schedule/schedule/schedule.py | 5 ++++- 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/main/views.py b/main/views.py index 0d7f8ae5..bc570fac 100644 --- a/main/views.py +++ b/main/views.py @@ -1204,12 +1204,7 @@ def update(self, request, pk=None): if response.status_code == status.HTTP_200_OK: data = response.data - schedule_data = { - 'start_date': data.get('start_date'), - 'timezone': data.get('timezone'), - 'repeat_mode': data.get('repeat_mode'), - 'personalized_repeat': data.get('personalized_repeat') - } + schedule_data = data['scheduler_config'] task_data = { 'id': data.get('id'), @@ -1219,7 +1214,7 @@ def update(self, request, pk=None): message = { 'action': 'update', - 'schedule_data': schedule_data, + 'schedule_config': schedule_data, 'task_data': task_data } diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index 87edc60e..ced6ca48 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -116,13 +116,24 @@ def _set_schedule_call_for_task(self, config_dict, task_id, crawler_id, behavior job = self.scheduler.schedule_job(config_dict, run_crawler, crawler_id=crawler_id, action=behavior) self.jobs[task_id] = job + def _remove_task(self, task_id: int): + if task_id not in self.jobs: + return + + self.scheduler.cancel_job(self.jobs[task_id], reason='User request.', remove_from_db=True) + del self.jobs[task_id] + def __process_task_data(self, data): action = data['action'] print(f'[{datetime.now()}] [TC] Jobs at start: {self.jobs}') + print('-' * 35) + print(f'[{datetime.now()}] [TC] data: {data}') + print('-' * 35) + if action == CANCEL_TASK: task_id = int(data['id']) - self.scheduler.cancel_job(self.jobs[task_id]) + self._remove_task(task_id) return config_dict = data['schedule_config'] @@ -131,7 +142,7 @@ def __process_task_data(self, data): behavior = data['task_data']['crawler_queue_behavior'] if action == UPDATE_TASK: - self.scheduler.cancel_job(self.jobs[task_id]) + self._remove_task(task_id) self._set_schedule_call_for_task(config_dict, task_id, crawler_id, behavior) elif action == CREATE_TASK: diff --git a/src/schedule/schedule/config.py b/src/schedule/schedule/config.py index bfb9c4ea..43f4c6a8 100644 --- a/src/schedule/schedule/config.py +++ b/src/schedule/schedule/config.py @@ -92,6 +92,10 @@ def save(self, db_session): db_session.add(self) db_session.commit() + def delete(self, db_session): + db_session.delete(self) + db_session.commit() + def first_run_date(self) -> datetime.datetime: ''' Calculates the first run date based on the config. diff --git a/src/schedule/schedule/job.py b/src/schedule/schedule/job.py index 5e1724ea..e8d4e6da 100644 --- a/src/schedule/schedule/job.py +++ b/src/schedule/schedule/job.py @@ -95,6 +95,14 @@ def save(self, db_session): db_session.add(self) db_session.commit() + + def delete(self, db_session): + ''' + Delete the job from the database. 
+ ''' + self.sched_config.delete(db_session) + db_session.delete(self) + db_session.commit() def recover(self) -> Any: ''' diff --git a/src/schedule/schedule/schedule.py b/src/schedule/schedule/schedule.py index 1987e618..83a41be2 100644 --- a/src/schedule/schedule/schedule.py +++ b/src/schedule/schedule/schedule.py @@ -103,7 +103,7 @@ def schedule_job(self, sched_config_dict: ConfigDict, job_func: Callable, *job_a return new_job - def cancel_job(self, job: Job, reason: str = None) -> None: + def cancel_job(self, job: Job, reason: str = None, remove_from_db: bool = False) -> None: ''' Delete a scheduled job. @@ -117,6 +117,9 @@ def cancel_job(self, job: Job, reason: str = None) -> None: self.jobs.remove(job) + if remove_from_db: + job.delete(self.db_session) + except ValueError: logger.debug('Cancelling not-scheduled job "%s"', job) From 29dd4237eff2f5b7f6eca739a4c87286dc796d59 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Thu, 18 May 2023 11:41:46 -0300 Subject: [PATCH 60/89] Fix bug when scheduling task in weekly and monthly repeat mode --- main/staticfiles/js/scheduler/scheduler.js | 2 -- src/schedule/schedule/config.py | 29 ++++++++++++++-------- src/schedule/schedule/config_dict.py | 4 +-- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/main/staticfiles/js/scheduler/scheduler.js b/main/staticfiles/js/scheduler/scheduler.js index c8defc63..a1edb5f8 100644 --- a/main/staticfiles/js/scheduler/scheduler.js +++ b/main/staticfiles/js/scheduler/scheduler.js @@ -387,8 +387,6 @@ function valid_new_scheduling() { let year_month_day = scheduling_task.scheduler_config.start_date.split('T')[0].split('-') calendar.daily.active_day = new Date(parseInt(year_month_day[0]), parseInt(year_month_day[1]) - 1, parseInt(year_month_day[2])); - - calendar.daily.show(); } function str_to_date(runtime) { diff --git a/src/schedule/schedule/config.py b/src/schedule/schedule/config.py index 43f4c6a8..3e8ac99c 100644 --- a/src/schedule/schedule/config.py +++ b/src/schedule/schedule/config.py @@ -255,18 +255,15 @@ def load_config(self, config_dict: ConfigDict) -> None: self.repeat_mode = config_dict['repeat_mode'] self.behavior_after_system_restart = config_dict.get('behavior_after_system_restart', DEFAULT_BEHAVIOR_AFTER_SYSTEM_RESTART) - if config_dict['repeat_mode'] == PERSONALIZED_REPEAT_MODE: - self._parse_personalized_config(config_dict['personalized_repeat']) + if self.repeat_mode == WEEKLY_REPEAT_MODE: + self.weekdays_to_run = [(self.start_date.weekday() + 1) % 7] - def now(self) -> datetime.datetime: - ''' - Returns the current datetime. + if self.repeat_mode == MONTHLY_REPEAT_MODE: + self.monthly_repeat_mode = MONTHLY_DAY_X_OCCURRENCE_TYPE + self.monthly_day_x_ocurrence = self.start_date.day - :returns: The current datetime. - ''' - - timezone = pytz.timezone(self.timezone) - return datetime.datetime.now(timezone).replace(tzinfo=None) + if self.repeat_mode == PERSONALIZED_REPEAT_MODE: + self._parse_personalized_config(config_dict['personalized_repeat']) def _parse_personalized_config(self, config_dict: PersonalizedRepeat) -> None: ''' @@ -631,4 +628,14 @@ def valid_config(config_dict: ConfigDict) -> None: Config._valid_start_date_and_timezone(config_dict) Config._valid_repeat_mode(config_dict) Config._valid_behavior_after_system_restart(config_dict) - Config._valid_personalized_repeat_mode(config_dict) \ No newline at end of file + Config._valid_personalized_repeat_mode(config_dict) + + def now(self) -> datetime.datetime: + ''' + Returns the current datetime. 
+ + :returns: The current datetime. + ''' + + timezone = pytz.timezone(self.timezone) + return datetime.datetime.now(timezone).replace(tzinfo=None) diff --git a/src/schedule/schedule/config_dict.py b/src/schedule/schedule/config_dict.py index 2e74eb16..eb025e1d 100644 --- a/src/schedule/schedule/config_dict.py +++ b/src/schedule/schedule/config_dict.py @@ -43,8 +43,8 @@ class PersonalizedRepeat(TypedDict): The attribute `mode` can be one of the following: - daily: The job will run every days. - - weekly: The job will run every weeks, on the days specified in . - - monthly: The job will run every months, on the days specified in . + - weekly: The job will run every weeks, on the days specified in . + - monthly: The job will run every months, on the days specified in . - yearly: The job will run every years. The attribute `interval` is the interval of the repetition. From 4823591fe5b37af7d66026bac12d197427ca3626 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Thu, 18 May 2023 11:48:26 -0300 Subject: [PATCH 61/89] Minor improvement --- scheduler/src/scheduler.py | 11 +++-------- src/schedule/schedule/schedule.py | 7 ++++--- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index ced6ca48..678f7bb2 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -116,24 +116,19 @@ def _set_schedule_call_for_task(self, config_dict, task_id, crawler_id, behavior job = self.scheduler.schedule_job(config_dict, run_crawler, crawler_id=crawler_id, action=behavior) self.jobs[task_id] = job - def _remove_task(self, task_id: int): + def _remove_task(self, task_id: int, reason: str = None, remove_from_db: bool = True): if task_id not in self.jobs: return - self.scheduler.cancel_job(self.jobs[task_id], reason='User request.', remove_from_db=True) + self.scheduler.cancel_job(self.jobs[task_id], reason=reason, remove_from_db=remove_from_db) del self.jobs[task_id] def __process_task_data(self, data): action = data['action'] - print(f'[{datetime.now()}] [TC] Jobs at start: {self.jobs}') - - print('-' * 35) - print(f'[{datetime.now()}] [TC] data: {data}') - print('-' * 35) if action == CANCEL_TASK: task_id = int(data['id']) - self._remove_task(task_id) + self._remove_task(task_id, reason='Task canceled by user', remove_from_db=False) return config_dict = data['schedule_config'] diff --git a/src/schedule/schedule/schedule.py b/src/schedule/schedule/schedule.py index 83a41be2..edbc5102 100644 --- a/src/schedule/schedule/schedule.py +++ b/src/schedule/schedule/schedule.py @@ -111,15 +111,16 @@ def cancel_job(self, job: Job, reason: str = None, remove_from_db: bool = False) ''' try: logger.debug('Cancelling job "%s"', job) - - job.cancel(reason) - job.save(self.db_session) self.jobs.remove(job) if remove_from_db: job.delete(self.db_session) + else: + job.cancel(reason) + job.save(self.db_session) + except ValueError: logger.debug('Cancelling not-scheduled job "%s"', job) From a005e934350005396d413b735d11b910ac7c9610 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Thu, 18 May 2023 15:17:50 -0300 Subject: [PATCH 62/89] Endpoint to cancel task --- main/models.py | 2 + main/serializers.py | 2 +- main/staticfiles/js/scheduler/services.js | 14 +++++++ main/views.py | 51 ++++++++++++++++++----- scheduler/src/scheduler.py | 3 +- src/schedule/schedule/schedule.py | 4 +- 6 files changed, 62 insertions(+), 14 deletions(-) diff --git a/main/models.py b/main/models.py index 33c7d67f..72dfea1d 100644 --- a/main/models.py +++ 
b/main/models.py @@ -665,6 +665,8 @@ class Task(TimeStamped): next_run = models.DateTimeField(null=True, blank=True) last_run = models.DateTimeField(null=True, blank=True) + cancelled_at = models.DateTimeField(null=True, blank=True) + crawler_queue_behavior = models.CharField( max_length=32, choices=CRAWLER_QUEUE_BEHAVIOR_CHOICES, default='wait_on_last_queue_position') diff --git a/main/serializers.py b/main/serializers.py index 060e1e38..dcd37c46 100644 --- a/main/serializers.py +++ b/main/serializers.py @@ -58,4 +58,4 @@ class Meta: fields = ['id', 'creation_date', 'last_modified', 'crawl_request', 'crawler_name', 'crawler_queue_behavior','last_run', - 'scheduler_config', 'next_run'] + 'scheduler_config', 'next_run', 'cancelled_at'] diff --git a/main/staticfiles/js/scheduler/services.js b/main/staticfiles/js/scheduler/services.js index 858a6925..53ccd0d3 100644 --- a/main/staticfiles/js/scheduler/services.js +++ b/main/staticfiles/js/scheduler/services.js @@ -96,6 +96,20 @@ services.delete_task = function(task_id) { }); } +services.cancel_task = function(task_id) { + $.ajax({ + url: `/api/scheduler/tasks/${task_id}/cancel`, + type: 'post', + async: false, + success: function (data) { + update_view(); + }, + error: function (data) { + console.error(data.responseText); + } + }); +} + services.save_updated_scheduling = function (task_being_edited) { let task_id = task_being_edited.id; diff --git a/main/views.py b/main/views.py index bc570fac..1504aff7 100644 --- a/main/views.py +++ b/main/views.py @@ -1171,7 +1171,12 @@ def create(self, request): try: SchedulerConfig.valid_config(schedule_config) - + + sched_config = SchedulerConfig() + sched_config.load_config(schedule_config) + + next_run = sched_config.first_run_date() + except Exception as e: return Response({'message': str(e)}, status=status.HTTP_400_BAD_REQUEST) @@ -1180,17 +1185,22 @@ def create(self, request): if response.status_code == status.HTTP_201_CREATED: data = response.data - message = { - 'action': 'create', - 'schedule_config': schedule_config, - 'task_data': { - 'id': data['id'], - 'crawl_request': data['crawl_request'], - 'crawler_queue_behavior': data['crawler_queue_behavior'], + try: + task_created = Task.objects.get(pk=data['id']) + + task_created.next_run = next_run + task_created.save() + + message = { + 'action': 'create', + 'schedule_config': schedule_config, + 'task_data': { + 'id': data['id'], + 'crawl_request': data['crawl_request'], + 'crawler_queue_behavior': data['crawler_queue_behavior'], + } } - } - try: crawler_manager.message_sender.send(TASK_TOPIC, message) except Exception as e: @@ -1247,12 +1257,33 @@ def partial_update(self, request, pk=None): crawler_manager.message_sender.send(TASK_TOPIC, message) return response + + @action(detail=True, methods=['get']) + def cancel(self, request, pk=None): + try: + task = Task.objects.get(pk=pk) + + except ObjectDoesNotExist: + return Response(status=status.HTTP_404_NOT_FOUND) + + task.cancelled_at = datetime.now() + task.save() + + message = { + 'action': 'cancel', + 'remove_from_db': False, + 'id': pk + } + + crawler_manager.message_sender.send(TASK_TOPIC, message) + return Response(status=status.HTTP_204_NO_CONTENT) def destroy(self, request, pk=None): response = super().destroy(request, pk=pk) if response.status_code == status.HTTP_204_NO_CONTENT: message = { 'action': 'cancel', + 'remove_from_db': True, 'id': pk } crawler_manager.message_sender.send(TASK_TOPIC, message) diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index 
678f7bb2..61c96d04 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -128,7 +128,8 @@ def __process_task_data(self, data): if action == CANCEL_TASK: task_id = int(data['id']) - self._remove_task(task_id, reason='Task canceled by user', remove_from_db=False) + remove_from_db = data['remove_from_db'] + self._remove_task(task_id, reason='Task canceled by user', remove_from_db=remove_from_db) return config_dict = data['schedule_config'] diff --git a/src/schedule/schedule/schedule.py b/src/schedule/schedule/schedule.py index edbc5102..a49ae659 100644 --- a/src/schedule/schedule/schedule.py +++ b/src/schedule/schedule/schedule.py @@ -138,14 +138,14 @@ def _load_jobs_from_db(self) -> None: if not job.cancelled_at: self.jobs.append(job) - def cancel_all_jobs(self) -> None: + def cancel_all_jobs(self, reason: str = None) -> None: ''' Clear all scheduled jobs. ''' logger.debug('Cancelling all jobs') for job in self.jobs: - job.cancel() + job.cancel(reason) self.jobs.clear() From 426a095b1405c7d7bf54dbfb931749f9428e00e0 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Thu, 18 May 2023 15:37:05 -0300 Subject: [PATCH 63/89] Disable debbug settings --- crawler_manager/settings.py | 4 ++-- interface/settings.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crawler_manager/settings.py b/crawler_manager/settings.py index bbbe5dab..81bf06e5 100644 --- a/crawler_manager/settings.py +++ b/crawler_manager/settings.py @@ -3,7 +3,7 @@ # Kafka host information KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' KAFKA_CONSUMER_TIMEOUT = 120000 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000 @@ -16,7 +16,7 @@ KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000 # Redis host information -REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') +REDIS_HOST = os.getenv('REDIS_HOST', 'redis') REDIS_PORT = int(os.getenv('REDIS_PORT', 6379)) REDIS_DB = int(os.getenv('REDIS_DB', 0)) REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None) diff --git a/interface/settings.py b/interface/settings.py index ef568ed4..c92b72c7 100644 --- a/interface/settings.py +++ b/interface/settings.py @@ -23,7 +23,7 @@ # Initialize Django-environ to read settings from environment variables env = environ.Env( # set casting, default value - DEBUG=(bool, True), + DEBUG=(bool, False), DJANGO_ALLOWED_HOSTS=(list, ['*']), LOG_TO_FILE=(bool, False), SQL_ENGINE=(str, "django.db.backends.sqlite3"), @@ -44,7 +44,7 @@ SECRET_KEY = get_random_secret_key() # SECURITY WARNING: don't run with debug turned on in production! 
-DEBUG = True#env('DEBUG') +DEBUG = env('DEBUG') ALLOWED_HOSTS = env('DJANGO_ALLOWED_HOSTS') From 7cfac58892eccc784d874ebcee442f78636b4ab7 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Fri, 19 May 2023 12:18:26 -0300 Subject: [PATCH 64/89] Update test_scheduler_config.py --- src/schedule/tests/test_scheduler_config.py | 103 ++++++++++---------- 1 file changed, 51 insertions(+), 52 deletions(-) diff --git a/src/schedule/tests/test_scheduler_config.py b/src/schedule/tests/test_scheduler_config.py index 7df4f04b..7a0b3f3a 100644 --- a/src/schedule/tests/test_scheduler_config.py +++ b/src/schedule/tests/test_scheduler_config.py @@ -1,18 +1,17 @@ import datetime import unittest -from constants import * +from schedule.constants import * -from scheduler_config import (SchedulerConfigDict, - SchedulerConfig, - SchedulerConfigInvalidRepeatModeError, - SchedulerConfigMissingFieldError, - SchedulerConfigValueError, - REQUIRED_FIELDS) +from schedule.schedule.config_dict import ConfigDict +from schedule.schedule.config_dict import (Config, + ConfigInvalidRepeatModeError, + ConfigMissingFieldError, + ConfigValueError) class TestSchedulerConfig(unittest.TestCase): def setUp(self) -> None: - self.config_dict: SchedulerConfigDict = {k: None for k in REQUIRED_FIELDS} + self.config_dict: ConfigDict = {k: None for k in REQUIRED_FIELDS} self._fill_start_date() def _fill_start_date(self): @@ -30,12 +29,12 @@ def _fill_personalized_repeat(self): } def test_raise_exception_if_missing_required_fields(self): - with self.assertRaises(SchedulerConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError): for req_field in REQUIRED_FIELDS: fields_with_missing_required_field = list(REQUIRED_FIELDS) fields_with_missing_required_field.remove(req_field) - config_dict: SchedulerConfigDict = {field: None for field in fields_with_missing_required_field} - SchedulerConfig.valid_config(config_dict) + config_dict: ConfigDict = {field: None for field in fields_with_missing_required_field} + Config.valid_config(config_dict) def test_raise_exception_with_invalid_start_date(self): now = datetime.datetime.now() @@ -48,89 +47,89 @@ def test_raise_exception_with_invalid_start_date(self): for invalid_input in (None, invalid_format_date, past_date_str): self.config_dict['start_date'] = invalid_input - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) def test_raise_exception_with_invalid_repeat_mode(self): self.config_dict['repeat_mode'] = 'unknow_repeat_mode' - with self.assertRaises(SchedulerConfigInvalidRepeatModeError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigInvalidRepeatModeError): + Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_value_is_not_dict(self): self.config_dict['repeat_mode'] = PERSONALIZED_REPEAT_MODE self.config_dict['personalized_repeat'] = None - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_is_missing_required_fields(self): - with self.assertRaises(SchedulerConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError): for req_field in PERSONALIZED_REQUIRED_FIELDS: fields_with_missing_required_field = list(PERSONALIZED_REQUIRED_FIELDS) fields_with_missing_required_field.remove(req_field) - config_dict: 
SchedulerConfigDict = {field: None for field in fields_with_missing_required_field} - SchedulerConfig.valid_config(config_dict) + config_dict: ConfigDict = {field: None for field in fields_with_missing_required_field} + Config.valid_config(config_dict) def test_raise_if_personalized_repeat_has_invalid_repeat_interval(self): self._fill_personalized_repeat() self.config_dict['personalized_repeat']['interval'] = '1' - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['interval'] = -1 - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_has_invalid_repeat_mode(self): self._fill_personalized_repeat() self.config_dict['personalized_repeat']['mode'] = 'unknow_mode' - with self.assertRaises(SchedulerConfigInvalidRepeatModeError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigInvalidRepeatModeError): + Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_mode_has_invalid_weekly_config(self): self._fill_personalized_repeat() self.config_dict['personalized_repeat']['mode'] = WEEKLY_REPEAT_MODE - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = [] - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = [7] - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = [-1] - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): self._fill_personalized_repeat() self.config_dict['personalized_repeat']['mode'] = MONTHLY_REPEAT_MODE - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = {} - with self.assertRaises(SchedulerConfigMissingFieldError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigMissingFieldError): + Config.valid_config(self.config_dict) required_fields = ['mode', 'value'] for req_field in required_fields: fields_with_missing_required_field = required_fields.copy() fields_with_missing_required_field.remove(req_field) self.config_dict['personalized_repeat']['data'] = {field: None for field in fields_with_missing_required_field} - with self.assertRaises(SchedulerConfigMissingFieldError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigMissingFieldError): + Config.valid_config(self.config_dict) # Personalized monthly repeat type of type DAY-X must receive a integer in the field `value` of # the dict `data`, and must be between 
1 and 31. @@ -139,8 +138,8 @@ def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): 'mode': MONTHLY_DAY_X_OCCURRENCE_TYPE, 'value':invalid_value } - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) # Personalized monthly repeat type of type first-weekday or last-weekday must receive a integer in the field `value` of # the dict `data`, and must be between 0 and 6. @@ -150,24 +149,24 @@ def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): 'mode': mode, 'value':invalid_value } - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): self._fill_personalized_repeat() self.config_dict['personalized_repeat']['finish'] = {} - with self.assertRaises(SchedulerConfigMissingFieldError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigMissingFieldError): + Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['finish'] = { 'mode': 'unknown_mode', 'value': None } - with self.assertRaises(SchedulerConfigInvalidRepeatModeError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigInvalidRepeatModeError): + Config.valid_config(self.config_dict) for invalid_input in ('-100', 0): self.config_dict['personalized_repeat']['finish'] = { @@ -175,8 +174,8 @@ def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): 'value': invalid_input } - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) now = datetime.datetime.now() @@ -192,5 +191,5 @@ def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): 'value': invalid_input } - with self.assertRaises(SchedulerConfigValueError): - SchedulerConfig.valid_config(self.config_dict) + with self.assertRaises(ConfigValueError): + Config.valid_config(self.config_dict) From 6fd4544eb37180f00a399f65bc93b8be70230f52 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Fri, 19 May 2023 12:25:06 -0300 Subject: [PATCH 65/89] Update test_scheduler_config.py --- src/schedule/tests/test_scheduler_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/schedule/tests/test_scheduler_config.py b/src/schedule/tests/test_scheduler_config.py index 7a0b3f3a..baca874b 100644 --- a/src/schedule/tests/test_scheduler_config.py +++ b/src/schedule/tests/test_scheduler_config.py @@ -3,8 +3,8 @@ from schedule.constants import * -from schedule.schedule.config_dict import ConfigDict -from schedule.schedule.config_dict import (Config, +from schedule.config_dict import ConfigDict +from schedule.config_dict import (Config, ConfigInvalidRepeatModeError, ConfigMissingFieldError, ConfigValueError) From 8547b0fdf004ea36abd2134217e3d95202cc29e3 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Fri, 19 May 2023 12:28:16 -0300 Subject: [PATCH 66/89] Update test_scheduler_config.py --- src/schedule/tests/test_scheduler_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/schedule/tests/test_scheduler_config.py b/src/schedule/tests/test_scheduler_config.py index baca874b..bcf2f975 100644 --- 
a/src/schedule/tests/test_scheduler_config.py +++ b/src/schedule/tests/test_scheduler_config.py @@ -4,7 +4,7 @@ from schedule.constants import * from schedule.config_dict import ConfigDict -from schedule.config_dict import (Config, +from schedule.config import (Config, ConfigInvalidRepeatModeError, ConfigMissingFieldError, ConfigValueError) From e46fd9dc5abb36c68e4cd0ddce476d8fc24c7e9a Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Mon, 22 May 2023 10:59:30 -0300 Subject: [PATCH 67/89] Fix tests --- src/schedule/schedule/job.py | 4 +- src/schedule/tests/test_config.py | 47 +++++++------ src/schedule/tests/test_date_utils.py | 74 ++++++++++----------- src/schedule/tests/test_job.py | 38 ++++++----- src/schedule/tests/test_scheduler_config.py | 41 ++++++------ 5 files changed, 104 insertions(+), 100 deletions(-) diff --git a/src/schedule/schedule/job.py b/src/schedule/schedule/job.py index e8d4e6da..ec86f51d 100644 --- a/src/schedule/schedule/job.py +++ b/src/schedule/schedule/job.py @@ -165,6 +165,8 @@ def exec_funct(self) -> Any: next_run = self.get_next_run() print(f'The job function {self.job_funct} requires the next run time: {next_run}') + self.last_run = self.sched_config.now() + return self.job_funct(next_run) def run(self): @@ -192,8 +194,6 @@ def run(self): self.cancel(f'Max repeats achieved ({self.cancel_after_max_repeats})') return CancelledJob - self.last_run = self.sched_config.now() - if isinstance(ret, CancelJob) or ret is CancelJob: logger.debug(f'Cancelling job {self}.\n\tReason: CancelJob returned.') self.cancel(f'CancelJob returned.') diff --git a/src/schedule/tests/test_config.py b/src/schedule/tests/test_config.py index b97d4dbb..305b5ce2 100644 --- a/src/schedule/tests/test_config.py +++ b/src/schedule/tests/test_config.py @@ -38,7 +38,7 @@ def _fill_personalized_repeat(self): } def test_raise_exception_if_missing_required_fields(self): - with self.assertRaises(ConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError, msg='Should raise exception if missing required fields'): for req_field in REQUIRED_FIELDS: fields_with_missing_required_field = list(REQUIRED_FIELDS) fields_with_missing_required_field.remove(req_field) @@ -56,12 +56,12 @@ def test_raise_exception_with_invalid_start_date(self): for invalid_input in (None, invalid_format_date, past_date_str): self.config_dict['start_date'] = invalid_input - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg=f'Should raise exception if start date is {invalid_input}'): Config.valid_config(self.config_dict) def test_raise_exception_with_invalid_repeat_mode(self): self.config_dict['repeat_mode'] = 'unknow_repeat_mode' - with self.assertRaises(ConfigInvalidRepeatModeError): + with self.assertRaises(ConfigInvalidRepeatModeError, msg='Should raise exception if repeat mode is invalid'): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_value_is_not_dict(self): @@ -71,7 +71,7 @@ def test_raise_if_personalized_repeat_value_is_not_dict(self): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_is_missing_required_fields(self): - with self.assertRaises(ConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError, msg='Should raise exception if missing required fields'): for req_field in PERSONALIZED_REQUIRED_FIELDS: fields_with_missing_required_field = list(PERSONALIZED_REQUIRED_FIELDS) fields_with_missing_required_field.remove(req_field) @@ -83,11 +83,11 @@ def 
test_raise_if_personalized_repeat_has_invalid_repeat_interval(self): self.config_dict['personalized_repeat']['interval'] = '1' - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if interval is not a int'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['interval'] = -1 - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if interval is not greater than 0'): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_has_invalid_repeat_mode(self): @@ -95,7 +95,7 @@ def test_raise_if_personalized_repeat_has_invalid_repeat_mode(self): self.config_dict['personalized_repeat']['mode'] = 'unknow_mode' - with self.assertRaises(ConfigInvalidRepeatModeError): + with self.assertRaises(ConfigInvalidRepeatModeError, msg='Should raise exception if mode is invalid'): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_mode_has_invalid_weekly_config(self): @@ -103,33 +103,33 @@ def test_raise_if_personalized_repeat_mode_has_invalid_weekly_config(self): self.config_dict['personalized_repeat']['mode'] = WEEKLY_REPEAT_MODE - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if data is not a list'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = [] - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if data is empty'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = [7] - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if data has invalid values'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = [-1] - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if data has invalid values'): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): self._fill_personalized_repeat() self.config_dict['personalized_repeat']['mode'] = MONTHLY_REPEAT_MODE - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if data is not a dict'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = {} - with self.assertRaises(ConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError, msg='Should raise exception if data is empty'): Config.valid_config(self.config_dict) required_fields = ['mode', 'value'] @@ -137,7 +137,7 @@ def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): fields_with_missing_required_field = required_fields.copy() fields_with_missing_required_field.remove(req_field) self.config_dict['personalized_repeat']['data'] = {field: None for field in fields_with_missing_required_field} - with self.assertRaises(ConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError, msg=f'Should raise exception if missing required fields'): Config.valid_config(self.config_dict) # Personalized monthly repeat type of type DAY-X must receive a integer in the field `value` of @@ -147,7 +147,7 @@ def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): 'mode': MONTHLY_DAY_X_OCCURRENCE_TYPE, 'value':invalid_value } - with self.assertRaises(ConfigValueError): + 
with self.assertRaises(ConfigValueError, msg=f'Should raise exception if value is {invalid_value}'): Config.valid_config(self.config_dict) # Personalized monthly repeat type of type first-weekday or last-weekday must receive a integer in the field `value` of @@ -158,7 +158,7 @@ def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): 'mode': mode, 'value':invalid_value } - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg=f'Should raise exception if value is {invalid_value}'): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): @@ -166,7 +166,7 @@ def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): self.config_dict['personalized_repeat']['finish'] = {} - with self.assertRaises(ConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError, msg='Should raise exception if missing required fields'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['finish'] = { @@ -174,7 +174,7 @@ def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): 'value': None } - with self.assertRaises(ConfigInvalidRepeatModeError): + with self.assertRaises(ConfigInvalidRepeatModeError, msg='Should raise exception if mode is invalid'): Config.valid_config(self.config_dict) for invalid_input in ('-100', 0): @@ -183,7 +183,7 @@ def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): 'value': invalid_input } - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg=f'Should raise exception if value is {invalid_input}'): Config.valid_config(self.config_dict) now = datetime.datetime.now() @@ -200,7 +200,7 @@ def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): 'value': invalid_input } - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg=f'Should raise exception if value is {invalid_input}'): Config.valid_config(self.config_dict) def test_if_config_can_be_retrieved_from_db(self): @@ -208,12 +208,11 @@ def test_if_config_can_be_retrieved_from_db(self): Config.valid_config(self.config_dict) - config = Config(self.db_session) + config = Config() config.load_config(self.config_dict) - self.db_session.add(config) - self.db_session.commit() + config.save(self.db_session) config_from_db = self.db_session.query(Config).first() - self.assertTrue(config_from_db == config) \ No newline at end of file + self.assertTrue(config_from_db == config, msg='Should be equal to the config loaded from the dict') \ No newline at end of file diff --git a/src/schedule/tests/test_date_utils.py b/src/schedule/tests/test_date_utils.py index 77bf11c4..827c78da 100644 --- a/src/schedule/tests/test_date_utils.py +++ b/src/schedule/tests/test_date_utils.py @@ -8,7 +8,7 @@ def test_date_date_with_invalid_date(self): year, month, day = 2021, 2, 31 date = get_date(year, month, day) - self.assertEqual(date.day, 28) + self.assertEqual(date.day, 28, 'The day should be 28') def test_get_last_day_of_month(self): # get_last_day_of_month should return the last day of the month @@ -16,7 +16,7 @@ def test_get_last_day_of_month(self): year, month = 2021, 12 last_day = get_last_day_of_month(month, year) - self.assertEqual(last_day, 31) + self.assertEqual(last_day, 31, 'The last day should be 31') def test_get_first_weekday_date_of_month(self): # get_first_weekday_date_of_month should return the first weekday of the month @@ -24,7 +24,7 @@ def 
test_get_first_weekday_date_of_month(self): year, month, weekday = 2023, 3, 6 date = get_first_weekday_date_of_month(weekday, year, month) - self.assertEqual(date.day, 4) + self.assertEqual(date.day, 4, 'The first saturday of march of 2023 is 04/03/2023') def test_get_last_weekday_date_of_month(self): # get_last_weekday_date_of_month should return the last weekday of the month @@ -32,7 +32,7 @@ def test_get_last_weekday_date_of_month(self): year, month, weekday = 2023, 4, 0 date = get_last_weekday_date_of_month(weekday, year, month) - self.assertEqual(date.day, 30) + self.assertEqual(date.day, 30, 'The last sunday of april of 2023 is 30/04/2023') def test_weeks_next_execution_date(self): # weeks_next_execution_date should return the next execution date given a base date, a list of days of week and a interval between executions @@ -42,28 +42,28 @@ def test_weeks_next_execution_date(self): days_of_week = [3] interval = 1 date = weeks_next_execution_date(base_date, days_of_week, interval) - self.assertEqual(date.day, 8) + self.assertEqual(date.day, 8, 'The next execution date should be 08/02/2023') # Another example, the next execution date of 2023-02-01, given a list of days of week [0, 2] (sunday and tuesday), interval equal to 3 weeks is 19/02/2023. days_of_week = [0, 2] interval = 3 date = weeks_next_execution_date(base_date, days_of_week, interval) - self.assertEqual(date.day, 19) + self.assertEqual(date.day, 19, 'The next execution date should be 19/02/2023') # Another example, the next execution date of 2023-02-01, given a list of days of week [2, 6] (sunday and tuesday), interval equal to 2 weeks is 04/02/2023. days_of_week = [2, 6] interval = 2 date = weeks_next_execution_date(base_date, days_of_week, interval) - self.assertEqual(date.day, 4) + self.assertEqual(date.day, 4, 'The next execution date should be 04/02/2023') # But if the list of days is only [2], the next execution date should be 14/02/2023. days_of_week = [2] interval = 2 date = weeks_next_execution_date(base_date, days_of_week, interval) - self.assertEqual(date.day, 14) + self.assertEqual(date.day, 14, 'The next execution date should be 14/02/2023') def test_month_next_execution_date(self): # months_next_execution_date should return the next execution date given a base date, the type of execution (day of month or weekday of month) and a interval between executions @@ -75,8 +75,8 @@ def test_month_next_execution_date(self): date = month_next_execution_date(base_date, type_of_execution, interval) - self.assertEqual(date.day, 1) - self.assertEqual(date.month, 3) + self.assertEqual(date.day, 1, 'The next execution date should be 01/03/2023') + self.assertEqual(date.month, 3, 'The next execution date should be 01/03/2023') # If the the next month has not the same number of days of the base date, the next execution date should be the last day of the next month. # For example, the next execution date of 2023-01-01, given a type of execution equal to 'day-x', day_x = 31 and interval equal to 1 is 28/02/2023. 
@@ -89,8 +89,8 @@ def test_month_next_execution_date(self):
 
         date = month_next_execution_date(base_date, type_of_execution, day_x, interval=interval)
 
-        self.assertEqual(date.day, 28)
-        self.assertEqual(date.month, 2)
+        self.assertEqual(date.day, 28, 'The next execution date should be 28/02/2023')
+        self.assertEqual(date.month, 2, 'The next execution date should be 28/02/2023')
 
         # Another example, the next execution date of 2023-02-01, given a type of execution equal to 'fist-weekday', first_weekday_to_run = 3 (runs every first wednesday of month), interval equal to 2 is 05/04/2023.
         # Because the next execution date is the first wednesday of march.
@@ -102,8 +102,8 @@ def test_month_next_execution_date(self):
 
         date = month_next_execution_date(base_date, type_of_execution, first_weekday_to_run=first_weekday_to_run, interval=interval)
 
-        self.assertEqual(date.day, 5)
-        self.assertEqual(date.month, 4)
+        self.assertEqual(date.day, 5, 'The next execution date should be 05/04/2023')
+        self.assertEqual(date.month, 4, 'The next execution date should be 05/04/2023')
 
         # Another example, the next execution date of 2023-02-01, given a type of execution equal to 'last-weekday', last_weekday_to_run = 1 (runs every last monday of month), interval equal to 7 is 25/09/2023.
         # Because the next execution date is the last monday of september.
@@ -115,8 +115,8 @@ def test_month_next_execution_date(self):
 
         date = month_next_execution_date(base_date, type_of_execution, last_weekday_to_run=last_weekday_to_run, interval=interval)
 
-        self.assertEqual(date.day, 25)
-        self.assertEqual(date.month, 9)
+        self.assertEqual(date.day, 25, 'The next execution date should be 25/09/2023')
+        self.assertEqual(date.month, 9, 'The next execution date should be 25/09/2023')
 
     def test_year_next_execution_date(self):
         # year_next_execution_date should return the next execution date given a base date and a interval between executions
@@ -125,9 +125,9 @@ def test_year_next_execution_date(self):
         base_date = datetime.datetime(2023, 2, 1)
         date = year_next_execution_date(base_date, interval=1)
 
-        self.assertEqual(date.day, 1)
-        self.assertEqual(date.month, 2)
-        self.assertEqual(date.year, 2024)
+        self.assertEqual(date.day, 1, 'The next execution date should be 01/02/2024')
+        self.assertEqual(date.month, 2, 'The next execution date should be 01/02/2024')
+        self.assertEqual(date.year, 2024, 'The next execution date should be 01/02/2024')
 
         # However, if the next year has not the same number of days of the base date, the next execution date should be the last day of the next year.
         # For example, the next execution date of 2020-02-29, interval equal to 3 is 28/02/2023.
@@ -136,9 +136,9 @@ def test_year_next_execution_date(self):
         base_date = datetime.datetime(2020, 2, 29)
         date = year_next_execution_date(base_date, interval=3)
 
-        self.assertEqual(date.day, 28)
-        self.assertEqual(date.month, 2)
-        self.assertEqual(date.year, 2023)
+        self.assertEqual(date.day, 28, 'The next execution date should be 28/02/2023')
+        self.assertEqual(date.month, 2, 'The next execution date should be 28/02/2023')
+        self.assertEqual(date.year, 2023, 'The next execution date should be 28/02/2023')
 
         # Another example, the next execution date of 2020-02-29, interval equal to 4 is 29/02/2024.
         # Because the next year (2024) has 29 days.
@@ -146,9 +146,9 @@ def test_year_next_execution_date(self): base_date = datetime.datetime(2020, 2, 29) date = year_next_execution_date(base_date, interval=4) - self.assertEqual(date.day, 29) - self.assertEqual(date.month, 2) - self.assertEqual(date.year, 2024) + self.assertEqual(date.day, 29, 'The next execution date should be 29/02/2024') + self.assertEqual(date.month, 2, 'The next execution date should be 29/02/2024') + self.assertEqual(date.year, 2024, 'The next execution date should be 29/02/2024') def test_decode_datetimestr(self): @@ -157,20 +157,20 @@ def test_decode_datetimestr(self): date = decode_datetimestr('2023-02-01 12:00:00') - self.assertEqual(date.day, 1) - self.assertEqual(date.month, 2) - self.assertEqual(date.year, 2023) - self.assertEqual(date.hour, 12) - self.assertEqual(date.minute, 0) - self.assertEqual(date.second, 0) + self.assertEqual(date.day, 1, 'The day should be 1') + self.assertEqual(date.month, 2, 'The month should be 2') + self.assertEqual(date.year, 2023, 'The year should be 2023') + self.assertEqual(date.hour, 12, 'The hour should be 12') + self.assertEqual(date.minute, 0, 'The minute should be 0') + self.assertEqual(date.second, 0, 'The second should be 0') # Another example, the datetime object of 2023-02-01 12:00:00 is 2023-02-01 12:00:00. date = decode_datetimestr('2023-02-01 12:00:00') - self.assertEqual(date.day, 1) - self.assertEqual(date.month, 2) - self.assertEqual(date.year, 2023) - self.assertEqual(date.hour, 12) - self.assertEqual(date.minute, 0) - self.assertEqual(date.second, 0) + self.assertEqual(date.day, 1, 'The day should be 1') + self.assertEqual(date.month, 2, 'The month should be 2') + self.assertEqual(date.year, 2023, 'The year should be 2023') + self.assertEqual(date.hour, 12, 'The hour should be 12') + self.assertEqual(date.minute, 0, 'The minute should be 0') + self.assertEqual(date.second, 0, 'The second should be 0') diff --git a/src/schedule/tests/test_job.py b/src/schedule/tests/test_job.py index d758a576..e5def18e 100644 --- a/src/schedule/tests/test_job.py +++ b/src/schedule/tests/test_job.py @@ -22,10 +22,10 @@ def setUp(self): 'timezone': 'America/Sao_Paulo', } - self.config = Config(self.session) + self.config = Config() self.config.load_config(self.config_dict) - self.job = Job(self.config, self.session) + self.job = Job(self.config) def test_check_if_can_retrieve_job_from_db(self): funct = FunctionWrapper(lambda s: s, 'test') @@ -36,19 +36,19 @@ def test_check_if_can_retrieve_job_from_db(self): job_from_db = self.session.query(Job).first() - self.assertTrue(job_from_db == self.job) + self.assertTrue(job_from_db == self.job, 'Job retrieved from db is not the same as the original job') def test_if_job_should_run_if_in_past(self): now = self.config.now() self.job.next_run = now - timedelta(seconds=1) - self.assertTrue(self.job.should_run) + self.assertTrue(self.job.should_run, 'Job should run if next_run is in the past') def test_if_job_should_run_if_in_future(self): now = self.config.now() self.job.next_run = now + timedelta(seconds=60) - self.assertFalse(self.job.should_run) + self.assertFalse(self.job.should_run, 'Job should not run if next_run is in the future') def test_if_job_should_run_if_now(self): now = self.config.now() @@ -58,7 +58,7 @@ def test_if_job_should_run_if_now(self): def test_if_job_exec_funct(self): self.job.job_funct = FunctionWrapper(lambda s: s, 'test') - self.assertEqual(self.job.exec_funct(), 'test') + self.assertEqual(self.job.exec_funct(), 'test', 'Job should return the function return 
value')
 
     def test_check_if_is_overdue(self):
         now = self.config.now()
@@ -70,7 +70,7 @@ def test_first_run_date(self):
 
         self.job._schedule_first_run()
 
-        self.assertEqual(self.job.next_run, start_date)
+        self.assertEqual(self.job.next_run, start_date, 'The first run date should be the start_date')
 
     def test_next_run_date(self):
         start_date = datetime.strptime(self.config_dict['start_date'], VALID_DATETIME_FORMATS[0])
@@ -78,7 +78,7 @@ def test_next_run_date(self):
         self.job._schedule_first_run()
         self.job._schedule_next_run()
 
-        self.assertEqual(self.job.next_run, start_date + timedelta(days=1))
+        self.assertEqual(self.job.next_run, start_date + timedelta(days=1), 'The next run date should be the start_date + 1 day')
 
     def test_cancel_job_after_restart(self):
         past_date = self.config.now() - timedelta(days=1)
@@ -90,21 +90,25 @@ def test_cancel_job_after_restart(self):
         self.job.sched_config.behavior_after_system_restart = CANCELL_TASK_ON_RESTART
         self.job.recover()
 
-        self.assertTrue(self.job.cancelled_at is not None)
+        self.assertTrue(self.job.cancelled_at is not None, 'The job should be cancelled')
 
     def test_reschedule_job_after_restart(self):
         past_date = self.config.now() - timedelta(days=1)
         self.job.next_run = past_date
 
         # When the job is recovered, the next_run is in the past and the behavior_after_system_restart
-        # is set to RESCHEDULE_TASK_ON_RESTART, the next_run should be rescheduled to the current date
+        # is set to RESCHEDULE_TASK_ON_RESTART, the next_run should be rescheduled to the next day,
+        # since the repeat_mode is set to daily and the start_date will be in the past
         self.job.sched_config.behavior_after_system_restart = RESCHEDULE_TASK_ON_RESTART
         self.job.recover()
 
-        next_run = past_date + timedelta(days=1)
+        # The next_run must be rescheduled to the next day, since the start_date is in the past
+        # and the task cannot be rescheduled to today because the 'now' used before will be different in
+        # the reschedule routine
+        next_run = past_date + timedelta(days=2)
 
-        self.assertEqual(self.job.next_run, next_run)
+        self.assertEqual(self.job.next_run, next_run, 'The next_run should be rescheduled to the next day')
 
     def test_run_job_immediatelly_after_restart(self):
         past_date = self.config.now() - timedelta(days=1)
@@ -116,11 +120,11 @@ def test_run_job_immediatelly_after_restart(self):
         self.job.job_funct = FunctionWrapper(lambda s: s, 'test')
         self.job.sched_config.behavior_after_system_restart = RUN_TASK_IMMEDIATELLY
 
-        self.assertIsNone(self.job.last_run)
+        self.assertIsNone(self.job.last_run, 'The last_run should be None before the job run')
 
         self.job.recover()
 
-        self.assertIsNotNone(self.job.last_run)
+        self.assertIsNotNone(self.job.last_run, 'The last_run should not be None after the job run')
 
     def test_job_run(self):
         self.job.job_funct = FunctionWrapper(lambda s: s, 'test')
@@ -130,7 +134,7 @@ def test_job_run(self):
 
         ret = self.job.run()
 
-        self.assertEqual(ret, 'test')
+        self.assertEqual(ret, 'test', 'The job should return the function return value')
 
     def test_count_number_of_runs(self):
         self.job.job_funct = FunctionWrapper(lambda s: s, 'test')
@@ -140,7 +144,7 @@ def test_count_number_of_runs(self):
 
         self.job.run()
 
-        self.assertEqual(self.job.num_repeats, 1)
+        self.assertEqual(self.job.num_repeats, 1, 'The job should have run only once')
 
     def test_job_can_self_cancel(self):
         self.job.job_funct = FunctionWrapper(lambda: CancelJob)
@@ -150,4 +154,4 @@ def test_job_can_self_cancel(self):
 
         self.job.run()
 
-        self.assertTrue(self.job.cancelled_at is not None)
\ No newline at end of file
+        
self.assertTrue(self.job.cancelled_at is not None, 'The job should be cancelled') \ No newline at end of file diff --git a/src/schedule/tests/test_scheduler_config.py b/src/schedule/tests/test_scheduler_config.py index bcf2f975..bfb8afa0 100644 --- a/src/schedule/tests/test_scheduler_config.py +++ b/src/schedule/tests/test_scheduler_config.py @@ -11,7 +11,8 @@ class TestSchedulerConfig(unittest.TestCase): def setUp(self) -> None: - self.config_dict: ConfigDict = {k: None for k in REQUIRED_FIELDS} + self.config_dict: ConfigDict = {k: None for k in REQUIRED_FIELDS} + self.config_dict['timezone'] = 'America/Sao_Paulo' self._fill_start_date() def _fill_start_date(self): @@ -29,7 +30,7 @@ def _fill_personalized_repeat(self): } def test_raise_exception_if_missing_required_fields(self): - with self.assertRaises(ConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError, msg='Should raise exception if missing required fields'): for req_field in REQUIRED_FIELDS: fields_with_missing_required_field = list(REQUIRED_FIELDS) fields_with_missing_required_field.remove(req_field) @@ -47,7 +48,7 @@ def test_raise_exception_with_invalid_start_date(self): for invalid_input in (None, invalid_format_date, past_date_str): self.config_dict['start_date'] = invalid_input - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg=f'Should raise exception if start_date is {invalid_input}'): Config.valid_config(self.config_dict) def test_raise_exception_with_invalid_repeat_mode(self): @@ -58,11 +59,11 @@ def test_raise_exception_with_invalid_repeat_mode(self): def test_raise_if_personalized_repeat_value_is_not_dict(self): self.config_dict['repeat_mode'] = PERSONALIZED_REPEAT_MODE self.config_dict['personalized_repeat'] = None - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if personalized_repeat is not a dict'): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_is_missing_required_fields(self): - with self.assertRaises(ConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError, msg='Should raise exception if missing required fields'): for req_field in PERSONALIZED_REQUIRED_FIELDS: fields_with_missing_required_field = list(PERSONALIZED_REQUIRED_FIELDS) fields_with_missing_required_field.remove(req_field) @@ -74,11 +75,11 @@ def test_raise_if_personalized_repeat_has_invalid_repeat_interval(self): self.config_dict['personalized_repeat']['interval'] = '1' - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if personalized_repeat interval is not a int'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['interval'] = -1 - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if personalized_repeat interval is not greater than 0'): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_has_invalid_repeat_mode(self): @@ -86,7 +87,7 @@ def test_raise_if_personalized_repeat_has_invalid_repeat_mode(self): self.config_dict['personalized_repeat']['mode'] = 'unknow_mode' - with self.assertRaises(ConfigInvalidRepeatModeError): + with self.assertRaises(ConfigInvalidRepeatModeError, msg='Should raise exception if personalized_repeat mode is invalid'): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_mode_has_invalid_weekly_config(self): @@ -94,29 +95,29 @@ def 
test_raise_if_personalized_repeat_mode_has_invalid_weekly_config(self): self.config_dict['personalized_repeat']['mode'] = WEEKLY_REPEAT_MODE - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if personalized_repeat mode is invalid'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = [] - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if personalized_repeat data is empty'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = [7] - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if personalized_repeat data is invalid'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = [-1] - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if personalized_repeat data is invalid'): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): self._fill_personalized_repeat() self.config_dict['personalized_repeat']['mode'] = MONTHLY_REPEAT_MODE - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if personalized_repeat data is invalid'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['data'] = {} @@ -128,7 +129,7 @@ def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): fields_with_missing_required_field = required_fields.copy() fields_with_missing_required_field.remove(req_field) self.config_dict['personalized_repeat']['data'] = {field: None for field in fields_with_missing_required_field} - with self.assertRaises(ConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError, msg=f'Should raise exception if missing required fields'): Config.valid_config(self.config_dict) # Personalized monthly repeat type of type DAY-X must receive a integer in the field `value` of @@ -138,7 +139,7 @@ def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): 'mode': MONTHLY_DAY_X_OCCURRENCE_TYPE, 'value':invalid_value } - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg=f'Should raise exception if personalized_repeat data is invalid'): Config.valid_config(self.config_dict) # Personalized monthly repeat type of type first-weekday or last-weekday must receive a integer in the field `value` of @@ -149,7 +150,7 @@ def test_raise_if_personalized_repeat_mode_has_invalid_monthly_config(self): 'mode': mode, 'value':invalid_value } - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg=f'Should raise exception if personalized_repeat data is invalid'): Config.valid_config(self.config_dict) def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): @@ -157,7 +158,7 @@ def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): self.config_dict['personalized_repeat']['finish'] = {} - with self.assertRaises(ConfigMissingFieldError): + with self.assertRaises(ConfigMissingFieldError, msg='Should raise exception if missing required fields'): Config.valid_config(self.config_dict) self.config_dict['personalized_repeat']['finish'] = { @@ -165,7 +166,7 @@ def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): 'value': None } - with 
self.assertRaises(ConfigInvalidRepeatModeError): + with self.assertRaises(ConfigInvalidRepeatModeError, msg='Should raise exception if personalized_repeat finish mode is invalid'): Config.valid_config(self.config_dict) for invalid_input in ('-100', 0): @@ -174,7 +175,7 @@ def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): 'value': invalid_input } - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if personalized_repeat finish value is invalid'): Config.valid_config(self.config_dict) now = datetime.datetime.now() @@ -191,5 +192,5 @@ def test_raise_if_personalized_repeat_mode_finish_has_invalid_config(self): 'value': invalid_input } - with self.assertRaises(ConfigValueError): + with self.assertRaises(ConfigValueError, msg='Should raise exception if personalized_repeat finish value is invalid'): Config.valid_config(self.config_dict) From 80c681b8c25f368c126d37275df74915c305a326 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Thu, 1 Jun 2023 09:19:40 -0300 Subject: [PATCH 68/89] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8a7bba0e..d6072587 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# C01 +# C01 Existem coletores bases, que podem ser personalizados através da interface feita em django. Eles são capazes de coletar: From a75f302e7e0dcfc8aa92d9ebeebe0768f03bc410 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Mon, 5 Jun 2023 17:03:06 -0300 Subject: [PATCH 69/89] API partial refactoring --- api/__init__.py | 0 api/admin.py | 3 + api/apps.py | 6 + api/migrations/__init__.py | 0 api/models.py | 3 + api/templates/api/swagger-ui.html | 31 + api/tests.py | 3 + api/urls.py | 68 ++ api/views/__init__.py | 4 + api/views/crawler.py | 122 +++ api/views/crawler_instance.py | 44 + api/views/crawler_queue.py | 106 +++ api/views/debugging/__init__.py | 0 api/views/debugging/log.py | 99 ++ api/views/debugging/screenshots.py | 55 ++ api/views/debugging/trace.py | 34 + api/views/status/__init__.py | 0 api/views/status/file_download.py | 80 ++ api/views/status/page_crawling.py | 86 ++ api/views/task.py | 98 ++ crawler_manager/settings.py | 6 +- interface/settings.py | 3 +- interface/urls.py | 3 +- main/custom_schema_generator.py | 0 main/staticfiles/json/steps_signature.json | 2 +- main/urls.py | 83 +- main/utils.py | 262 +++++ main/views.py | 1000 +------------------- 28 files changed, 1168 insertions(+), 1033 deletions(-) create mode 100644 api/__init__.py create mode 100644 api/admin.py create mode 100644 api/apps.py create mode 100644 api/migrations/__init__.py create mode 100644 api/models.py create mode 100644 api/templates/api/swagger-ui.html create mode 100644 api/tests.py create mode 100644 api/urls.py create mode 100644 api/views/__init__.py create mode 100644 api/views/crawler.py create mode 100644 api/views/crawler_instance.py create mode 100644 api/views/crawler_queue.py create mode 100644 api/views/debugging/__init__.py create mode 100644 api/views/debugging/log.py create mode 100644 api/views/debugging/screenshots.py create mode 100644 api/views/debugging/trace.py create mode 100644 api/views/status/__init__.py create mode 100644 api/views/status/file_download.py create mode 100644 api/views/status/page_crawling.py create mode 100644 api/views/task.py create mode 100644 main/custom_schema_generator.py create mode 100644 main/utils.py diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 
00000000..e69de29b diff --git a/api/admin.py b/api/admin.py new file mode 100644 index 00000000..8c38f3f3 --- /dev/null +++ b/api/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. diff --git a/api/apps.py b/api/apps.py new file mode 100644 index 00000000..66656fd2 --- /dev/null +++ b/api/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class ApiConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'api' diff --git a/api/migrations/__init__.py b/api/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/api/models.py b/api/models.py new file mode 100644 index 00000000..71a83623 --- /dev/null +++ b/api/models.py @@ -0,0 +1,3 @@ +from django.db import models + +# Create your models here. diff --git a/api/templates/api/swagger-ui.html b/api/templates/api/swagger-ui.html new file mode 100644 index 00000000..73415c62 --- /dev/null +++ b/api/templates/api/swagger-ui.html @@ -0,0 +1,31 @@ + + + + + Swagger + + + + + + +
    + + + + + \ No newline at end of file diff --git a/api/tests.py b/api/tests.py new file mode 100644 index 00000000..7ce503c2 --- /dev/null +++ b/api/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/api/urls.py b/api/urls.py new file mode 100644 index 00000000..8b938e50 --- /dev/null +++ b/api/urls.py @@ -0,0 +1,68 @@ +from django.urls import path +from django.views.generic import TemplateView +from rest_framework.schemas import get_schema_view +from django.shortcuts import redirect +from . import views + +app_name = 'api' + +# Router for API endpoints +# api_router = routers.DefaultRouter() +# api_router.register(r'crawlers', views.CrawlerViewSet) +# api_router.register(r'instances', views.CrawlerInstanceViewSet) +# api_router.register(r'crawler_queue', views.CrawlerQueueViewSet) +# api_router.register(r'tasks', views.TaskViewSet) + +list_and_create_actions = {'get': 'list', 'post': 'create'} +retrieve_update_and_destroy_actions = {'get': 'retrieve', 'put': 'update', 'delete': 'destroy'} +all_actions = {'get': 'list', 'post': 'create', 'put': 'update', 'delete': 'destroy'} +only_list_action = {'get': 'list'} +only_retrieve_action = {'get': 'retrieve'} + +urlpatterns = [ + path('', lambda request: redirect('api:swagger-ui', permanent=True)), + + path('crawler/', views.CrawlerViewSet.as_view(list_and_create_actions), name='crawler'), + path('crawler/', views.CrawlerViewSet.as_view(retrieve_update_and_destroy_actions), name='crawler-detail'), + path('crawler//run', views.CrawlerViewSet.as_view({'get': 'run'}), name='crawler-run'), + path('crawler//stop', views.CrawlerViewSet.as_view({'get': 'stop'}), name='crawler-run'), + path('crawler//group', views.CrawlerViewSet.as_view({'get': 'group'}), name='crawler-group'), + + path('instance/', views.CrawlerInstanceViewSet.as_view(only_list_action), name='instance'), + path('instance/', views.CrawlerInstanceViewSet.as_view(only_retrieve_action), name='instance-detail'), + path('instance//export_config', views.CrawlerInstanceViewSet.as_view({'get': 'export_config'}), name='instance-export-config'), + + path('task/', views.TaskViewSet.as_view(list_and_create_actions), name='task'), + path('task/', views.TaskViewSet.as_view(retrieve_update_and_destroy_actions), name='task-detail'), + path('task//filter', views.TaskViewSet.as_view({'get': 'filter'}), name='task-filter'), + + path('queue/', views.CrawlerQueueViewSet.as_view({'get': 'retrieve', 'put': 'update'}), name='queue'), + path('queue/switch_position//', views.CrawlerQueueViewSet.as_view({'get': 'switch_position'}), name='queue-switch-position'), + path('queue/force_execution/', views.CrawlerQueueViewSet.as_view({'get': 'force_execution'}), name='queue-force-execution'), + path('queue/remove_item/', views.CrawlerQueueViewSet.as_view({'get': 'remove_item'}), name='queue-remove-item'), + + path('open-api/', get_schema_view( + title='Plataforma de Coletas - API', + description='API para as principais funcionalidades da plataforma de coletas.', + version='1.0.0', + public=True, + url='/api/', + urlconf='api.urls' + ), name='open-api'), + path('swagger-ui/', TemplateView.as_view( + template_name='api/swagger-ui.html', + extra_context={'schema_url':'api:open-api'} + ), name='swagger-ui') +] + +# # Includes the API endpoints in the URLs +# url(r'^api/', include(api_router.urls)), +# path('openapi/', get_schema_view( +# title='Áduna', +# description='API para busca de dados não estruturados', +# url='/services/', +# version='1.0.0', +# 
urlconf='services.urls', +# public=True, +# ), name='openapi'), +# ] diff --git a/api/views/__init__.py b/api/views/__init__.py new file mode 100644 index 00000000..a333f6f4 --- /dev/null +++ b/api/views/__init__.py @@ -0,0 +1,4 @@ +from .crawler import CrawlerViewSet +from .crawler_instance import CrawlerInstanceViewSet +from .crawler_queue import CrawlerQueueViewSet +from .task import TaskViewSet \ No newline at end of file diff --git a/api/views/crawler.py b/api/views/crawler.py new file mode 100644 index 00000000..b3b22e00 --- /dev/null +++ b/api/views/crawler.py @@ -0,0 +1,122 @@ +from rest_framework import status, viewsets +from rest_framework.decorators import action +from rest_framework.response import Response + +from django.db import transaction + +from main.models import CrawlRequest, ParameterHandler, ResponseHandler +from main.serializers import CrawlRequestSerializer + +from main.utils import (add_crawl_request, unqueue_crawl_requests, + process_run_crawl, process_stop_crawl) + +class CrawlerViewSet(viewsets.ModelViewSet): + """ + ViewSet that allows crawlers to be viewed, edited, updated and removed. + """ + queryset = CrawlRequest.objects.all().order_by('-creation_date') + serializer_class = CrawlRequestSerializer + + def _create_templated_url_parameter_handlers(self, parameter_handlers, crawler_id): + for handler in parameter_handlers: + handler['crawler_id'] = crawler_id + handler['injection_type'] = 'templated_url' + ParameterHandler.objects.create(**handler) + + def _create_templated_url_response_handlers(self, response_handlers, crawler_id): + for handler in response_handlers: + handler['crawler_id'] = crawler_id + handler['injection_type'] = 'templated_url' + ResponseHandler.objects.create(**handler) + + def create(self, request, *args, **kwargs): + """ + Create a new crawler. 
+ """ + data = request.data + + if type(data) is not dict: + data = data.dict() + + templated_url_parameter_handlers = data.pop('templated_url_parameter_handlers', []) + templated_url_response_handlers = data.pop('templated_url_response_handlers', []) + + serializer = CrawlRequestSerializer(data=request.data) + if serializer.is_valid(): + with transaction.atomic(): + serializer.save() + + crawler_id = serializer.data['id'] + + self._create_templated_url_parameter_handlers(templated_url_parameter_handlers, crawler_id) + self._create_templated_url_response_handlers(templated_url_response_handlers, crawler_id) + + return Response({'id': crawler_id}, status=status.HTTP_201_CREATED) + + return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST) + + @action(detail=True, methods=['get']) + def run(self, request, pk): + query_params = self.request.query_params.dict() + action = query_params.get('action', '') + + if action == 'run_immediately': + wait_on = 'no_wait' + + add_crawl_request(pk, wait_on) + instance = process_run_crawl(pk) + + return Response({'instance_id': instance.instance_id}, status=status.HTTP_201_CREATED) + + elif action == 'wait_on_first_queue_position': + wait_on = 'first_position' + + else: + wait_on = 'last_position' + + try: + add_crawl_request(pk, wait_on) + + crawl_request = CrawlRequest.objects.get(pk=pk) + queue_type = crawl_request.expected_runtime_category + + unqueue_crawl_requests(queue_type) + + except Exception as e: + return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) + + if wait_on == 'first_position': + message = f'Crawler added to crawler queue in first position' + + else: + message = f'Crawler added to crawler queue in last position' + + return Response({'message': message}, status=status.HTTP_200_OK) + + @action(detail=True, methods=['get']) + def stop(self, request, pk): + try: + process_stop_crawl(pk) + except Exception as e: + return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) + + return Response(status=status.HTTP_204_NO_CONTENT) + + @action(detail=True, methods=['get']) + def group(self, request, pk): + crawlers = CrawlRequest.objects.raw( + "select id, source_name \ + from main_crawlrequest \ + where steps=( \ + select steps from main_crawlrequest where id = "+str(pk)+") order by id desc") + + json_data = [] + for item in crawlers: + json_data.append({ + 'id': item.id, + 'source_name': item.source_name, + 'last_modified': item.last_modified, + 'base_url': item.base_url, + }) + + return Response(json_data, status=status.HTTP_200_OK) \ No newline at end of file diff --git a/api/views/crawler_instance.py b/api/views/crawler_instance.py new file mode 100644 index 00000000..0066ce1b --- /dev/null +++ b/api/views/crawler_instance.py @@ -0,0 +1,44 @@ +import os + +from rest_framework import viewsets +from rest_framework.decorators import action +from rest_framework.response import Response +from rest_framework import status +from django.http import FileResponse + +from main.models import CrawlerInstance +from main.serializers import CrawlerInstanceSerializer + +from crawler_manager.settings import OUTPUT_FOLDER + +class CrawlerInstanceViewSet(viewsets.ReadOnlyModelViewSet): + """ + A simple ViewSet for viewing and listing instances + """ + queryset = CrawlerInstance.objects.all() + serializer_class = CrawlerInstanceSerializer + + @action(detail=True, methods=['get']) + def export_config(self, request, pk): + try: + instance = CrawlerInstance.objects.get(pk=pk) + + except: + return Response({'error': f'Crawler 
instance {pk} not found!'}, status=status.HTTP_404_NOT_FOUND) + + data_path = instance.crawler.data_path + file_name = f'{pk}.json' + rel_path = os.path.join(data_path, str(pk), 'config', file_name) + path = os.path.join(OUTPUT_FOLDER, rel_path) + + try: + response = FileResponse(open(path, 'rb'), content_type='application/json', status=status.HTTP_200_OK) + + except FileNotFoundError: + response = Response({'error': f'Arquivo de Configuração Não Encontrado: {file_name}'}, status=status.HTTP_404_NOT_FOUND) + + else: + response['Content-Length'] = os.path.getsize(path) + response['Content-Disposition'] = 'attachment; filename=%s' % file_name + + return response \ No newline at end of file diff --git a/api/views/crawler_queue.py b/api/views/crawler_queue.py new file mode 100644 index 00000000..42d75b1a --- /dev/null +++ b/api/views/crawler_queue.py @@ -0,0 +1,106 @@ +from django.core.exceptions import ObjectDoesNotExist +from django.db import transaction +from django.http import JsonResponse +from rest_framework import status, viewsets +from rest_framework.decorators import action +from rest_framework.response import Response + +from main.models import CrawlerQueue, CrawlerQueueItem, CRAWLER_QUEUE_DB_ID +from main.serializers import CrawlerQueueSerializer +from main.utils import (process_run_crawl, unqueue_crawl_requests, CRAWLER_QUEUE) + +class CrawlerQueueViewSet(viewsets.ModelViewSet): + queryset = CrawlerQueue.objects.all() + serializer_class = CrawlerQueueSerializer + http_method_names = ['get', 'put'] + + def retrieve(self, request): + crawler_queue = CrawlerQueue.to_dict() + return Response(crawler_queue) + + @action(detail=False, methods=['get']) + def switch_position(self, request, a: int, b: int): + with transaction.atomic(): + try: + queue_item_a = CrawlerQueueItem.objects.get(pk=a) + + except ObjectDoesNotExist: + return Response({'error': f'Crawler queue item {a} not found!'}, status=status.HTTP_404_NOT_FOUND) + + try: + queue_item_b = CrawlerQueueItem.objects.get(pk=b) + + except ObjectDoesNotExist: + return Response({'error': f'Crawler queue item {b} not found!'}, status=status.HTTP_404_NOT_FOUND) + + if queue_item_a.queue_type != queue_item_b.queue_type: + return Response({'error': 'Crawler queue items must be in same queue!'}, status=status.HTTP_400_BAD_REQUEST) + + position_aux = queue_item_a.position + + queue_item_a.position = queue_item_b.position + queue_item_b.position = position_aux + + queue_item_a.save() + queue_item_b.save() + + return Response({'message': 'success'}, status=status.HTTP_200_OK) + + @action(detail=False, methods=['get']) + def force_execution(self, request, item_id: int): + with transaction.atomic(): + try: + queue_item = CrawlerQueueItem.objects.get(pk=item_id) + + except ObjectDoesNotExist: + return Response({'error': f'Crawler queue item {item_id} not found!'}, status=status.HTTP_404_NOT_FOUND) + + crawler_id = queue_item.crawl_request.id + + instance = process_run_crawl(crawler_id) + + queue_item.forced_execution = True + queue_item.running = True + queue_item.save() + + data = { + 'crawler_id': crawler_id, + 'instance_id': instance.pk + } + + return Response(data, status=status.HTTP_200_OK) + + @action(detail=False, methods=['get']) + def remove_item(self, request, item_id: int): + try: + queue_item = CrawlerQueueItem.objects.get(pk=item_id) + queue_item.delete() + + except ObjectDoesNotExist: + return Response({'error': f'Crawler queue item {item_id} not found!'}, status=status.HTTP_404_NOT_FOUND) + + return 
Response(status=status.HTTP_204_NO_CONTENT)
+
+    def update(self, request):
+        response = super().update(request, pk=CRAWLER_QUEUE_DB_ID)
+
+        # update the crawler queue instance with the new configs
+        global CRAWLER_QUEUE
+        CRAWLER_QUEUE = CrawlerQueue.object()
+
+        # the size of the fast queue changed, so it may be possible to run
+        # more crawlers
+        if 'max_fast_runtime_crawlers_running' in request.data:
+            unqueue_crawl_requests('fast')
+
+        # the size of the medium queue changed, so it may be possible to run
+        # more crawlers
+        if 'max_medium_runtime_crawlers_running' in request.data:
+            unqueue_crawl_requests('medium')
+
+        # the size of the slow queue changed, so it may be possible to run
+        # more crawlers
+        if 'max_slow_runtime_crawlers_running' in request.data:
+            unqueue_crawl_requests('slow')
+
+        return response
\ No newline at end of file
diff --git a/api/views/debugging/__init__.py b/api/views/debugging/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/api/views/debugging/log.py b/api/views/debugging/log.py
new file mode 100644
index 00000000..b409fd95
--- /dev/null
+++ b/api/views/debugging/log.py
@@ -0,0 +1,99 @@
+import os
+import subprocess
+import time
+from datetime import datetime
+
+from rest_framework.response import Response
+
+from crawler_manager.settings import OUTPUT_FOLDER
+from main.models import CrawlRequest, CrawlerInstance
+
+def raw_log_err(request, instance_id):
+    instance = CrawlerInstance.objects.get(instance_id=instance_id)
+
+    config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0]
+    data_path = os.path.join(OUTPUT_FOLDER, config["data_path"])
+
+    err = subprocess.run(["tail",
+                          f"{data_path}/{instance_id}/log/{instance_id}.err",
+                          "-n",
+                          "100"],
+                         stdout=subprocess.PIPE).stdout
+
+    raw_text = err.decode('utf-8')
+    raw_results = raw_text.splitlines(True)
+    resp = Response({str(instance_id): raw_results},
+                    json_dumps_params={'indent': 2})
+
+    if len(raw_results) > 0 and instance.running:
+        resp['Refresh'] = 5
+
+    return resp
+
+def raw_log_out(request, instance_id):
+    instance = CrawlerInstance.objects.get(instance_id=instance_id)
+
+    config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0]
+    data_path = os.path.join(OUTPUT_FOLDER, config["data_path"])
+
+    out = subprocess.run(["tail",
+                          f"{data_path}/{instance_id}/log/{instance_id}.out",
+                          "-n",
+                          "100"],
+                         stdout=subprocess.PIPE).stdout
+
+    raw_text = out.decode('utf-8')
+    raw_results = raw_text.splitlines(True)
+    resp = Response({str(instance_id): raw_results},
+                    json_dumps_params={'indent': 2})
+
+    if len(raw_results) > 0 and instance.running:
+        resp['Refresh'] = 5
+
+    return resp
+
+def tail_log_file(request, instance_id):
+    instance = CrawlerInstance.objects.get(instance_id=instance_id)
+
+    files_found = instance.number_files_found
+    download_file_success = instance.number_files_success_download
+    download_file_error = instance.number_files_error_download
+    number_files_previously_crawled = instance.number_files_previously_crawled
+
+    pages_found = instance.number_pages_found
+    download_page_success = instance.number_pages_success_download
+    download_page_error = instance.number_pages_error_download
+    number_pages_duplicated_download = instance.number_pages_duplicated_download
+    number_pages_previously_crawled = instance.number_pages_previously_crawled
+
+    config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0]
+    data_path = os.path.join(OUTPUT_FOLDER, config["data_path"])
+
+    out = subprocess.run(["tail",
+                          
f"{data_path}/{instance_id}/log/{instance_id}.out", + "-n", + "20"], + stdout=subprocess.PIPE).stdout + + err = subprocess.run(["tail", + f"{data_path}/{instance_id}/log/{instance_id}.err", + "-n", + "20"], + stdout=subprocess.PIPE).stdout + + return Response({ + "files_found": files_found, + "files_success": download_file_success, + "files_error": download_file_error, + "files_previously_crawled": number_files_previously_crawled, + + "pages_found": pages_found, + "pages_success": download_page_success, + "pages_error": download_page_error, + "pages_duplicated": number_pages_duplicated_download, + "pages_previously_crawled": number_pages_previously_crawled, + + "out": out.decode('utf-8'), + "err": err.decode('utf-8'), + "time": str(datetime.fromtimestamp(time.time())), + }) \ No newline at end of file diff --git a/api/views/debugging/screenshots.py b/api/views/debugging/screenshots.py new file mode 100644 index 00000000..ddb1d539 --- /dev/null +++ b/api/views/debugging/screenshots.py @@ -0,0 +1,55 @@ +import os +import base64 + +from rest_framework.response import Response +from rest_framework import status + +from main.models import CrawlerInstance + +def view_screenshots(request, instance_id, page): + IMGS_PER_PAGE = 20 + + try: + instance = CrawlerInstance.objects.get(pk=instance_id) # get_object_or_404(CrawlerInstance, pk=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + output_folder = os.getenv('OUTPUT_FOLDER', '/data') + data_path = instance.crawler.data_path + instance_path = os.path.join(output_folder, data_path, str(instance_id)) + + screenshot_dir = os.path.join(instance_path, "data", "screenshots") + + if not os.path.isdir(screenshot_dir): + return Response({ + 'error': 'Pasta de coleta não encontrada.', + 'total_screenshots': 0 + }, status=status.HTTP_200_OK) + + screenshot_list = sorted(os.listdir(screenshot_dir)) + total_screenshots = len(screenshot_list) + + if total_screenshots == 0: + return Response({ + 'error': 'Nenhum screenshot encontrado.', + 'total_screenshots': 0 + }, status=status.HTTP_200_OK) + + screenshot_list = screenshot_list[(page - 1) * IMGS_PER_PAGE: + page * IMGS_PER_PAGE] + + image_data = [] + for index, screenshot in enumerate(screenshot_list): + img_path = os.path.join(screenshot_dir, screenshot) + with open(img_path, "rb") as image: + curr_img = { + 'base64': base64.b64encode(image.read()).decode('ascii'), + 'title': str(1 + index + ((page - 1) * IMGS_PER_PAGE)) + } + image_data.append(curr_img) + + return Response({ + 'data': image_data, + 'total_screenshots': total_screenshots + }, status=status.HTTP_200_OK) \ No newline at end of file diff --git a/api/views/debugging/trace.py b/api/views/debugging/trace.py new file mode 100644 index 00000000..1fdb0fc4 --- /dev/null +++ b/api/views/debugging/trace.py @@ -0,0 +1,34 @@ +import os +from rest_framework.response import Response +from rest_framework import status + +from django.http import FileResponse +from main.models import CrawlerInstance + +from crawler_manager.settings import OUTPUT_FOLDER + +def export_trace(request, instance_id): + try: + instance = CrawlerInstance.objects.get(pk=instance_id) # get_object_or_404(CrawlerInstance, pk=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + data_path = instance.crawler.data_path + + file_name = f"{instance_id}.zip" + rel_path = os.path.join(data_path, str(instance_id), "debug", "trace", file_name) + path = os.path.join(OUTPUT_FOLDER, rel_path) + + try: + response = FileResponse(open(path, 'rb'), 
content_type='zip') + + except FileNotFoundError: + return Response({'error': 'Verifique se a opção de gerar arquivo trace foi habilitada na configuração do coletor'}, + status=status.HTTP_404_NOT_FOUND) + + else: + response['Content-Length'] = os.path.getsize(path) + response['Content-Disposition'] = "attachment; filename=%s" % file_name + + return response \ No newline at end of file diff --git a/api/views/status/__init__.py b/api/views/status/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/api/views/status/file_download.py b/api/views/status/file_download.py new file mode 100644 index 00000000..56f63a14 --- /dev/null +++ b/api/views/status/file_download.py @@ -0,0 +1,80 @@ +from rest_framework import status +from rest_framework.response import Response + +from main.models import CrawlerInstance +from main.utils import process_stop_crawl + +def files_found(request, instance_id, num_files): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + try: + instance.number_files_found += num_files + instance.save() + + return Response(status=status.HTTP_200_OK) + + except Exception as e: + return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) + + +def success_download_file(request, instance_id): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + try: + instance.number_files_success_download += 1 + instance.save() + + if instance.page_crawling_finished and instance.download_files_finished(): + process_stop_crawl(instance.crawler.id) + + return Response(status=status.HTTP_200_OK) + + except Exception as e: + return Response({'error': str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR) + +def previously_crawled_file(request, instance_id): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + try: + instance.number_files_previously_crawled += 1 + instance.save() + + if instance.page_crawling_finished and instance.download_files_finished(): + process_stop_crawl(instance.crawler.id) + + return Response({}, status=status.HTTP_200_OK) + + except: + return Response({}, status=status.HTTP_400_BAD_REQUEST) + + +def error_download_file(request, instance_id): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + try: + instance.number_files_error_download += 1 + instance.save() + + if instance.page_crawling_finished and instance.download_files_finished(): + process_stop_crawl(instance.crawler.id) + + return Response({}, status=status.HTTP_200_OK) + + except: + return Response({}, status=status.HTTP_400_BAD_REQUEST) \ No newline at end of file diff --git a/api/views/status/page_crawling.py b/api/views/status/page_crawling.py new file mode 100644 index 00000000..b521710a --- /dev/null +++ b/api/views/status/page_crawling.py @@ -0,0 +1,86 @@ +from rest_framework.response import Response +from rest_framework import status + +from main.models import CrawlerInstance + +def pages_found(request, instance_id, num_pages): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + try: + instance.number_pages_found += num_pages + instance.save() + + return Response(status=status.HTTP_200_OK) + + except Exception as e: + return Response({'error': 
str(e)}, status=status.HTTP_400_BAD_REQUEST) + + +def success_download_page(request, instance_id): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + try: + instance.number_pages_success_download += 1 + instance.save() + + return Response(status=status.HTTP_200_OK) + + except Exception as e: + return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) + +def previously_crawled_page(request, instance_id): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + try: + instance.number_pages_previously_crawled += 1 + instance.save() + + return Response(status=status.HTTP_200_OK) + + except Exception as e: + return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) + +def error_download_page(request, instance_id): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + try: + instance.number_pages_error_download += 1 + instance.save() + + return Response(status=status.HTTP_200_OK) + + except Exception as e: + return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) + + +def duplicated_download_page(request, instance_id): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + try: + instance.number_pages_duplicated_download += 1 + instance.save() + + return Response(status=status.HTTP_200_OK) + + except Exception as e: + return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) diff --git a/api/views/task.py b/api/views/task.py new file mode 100644 index 00000000..ac0015bf --- /dev/null +++ b/api/views/task.py @@ -0,0 +1,98 @@ +import json +from datetime import datetime + +from rest_framework import viewsets, status +from rest_framework.response import Response +from rest_framework.decorators import action + +from main.models import Task +from main.serializers import TaskSerializer +from main.task_filter import task_filter_by_date_interval + +import crawler_manager.crawler_manager as crawler_manager +from crawler_manager.settings import TASK_TOPIC + +class TaskViewSet(viewsets.ModelViewSet): + queryset = Task.objects.all() + serializer_class = TaskSerializer + + def create(self, request): + response = super().create(request) + if response.status_code == status.HTTP_201_CREATED: + message = { + 'action': 'create', + 'data': response.data + } + crawler_manager.message_sender.send(TASK_TOPIC, message) + return response + + def update(self, request, pk=None): + response = super().update(request, pk=pk) + if response.status_code == status.HTTP_200_OK: + message = { + 'action': 'update', + 'data': response.data + } + crawler_manager.message_sender.send(TASK_TOPIC, message) + return response + + def partial_update(self, request, pk=None): + response = super().partial_update(request, pk=pk) + if response.status_code == status.HTTP_200_OK: + message = { + 'action': 'update', + 'data': response.data + } + crawler_manager.message_sender.send(TASK_TOPIC, message) + return response + + def destroy(self, request, pk=None): + response = super().destroy(request, pk=pk) + if response.status_code == status.HTTP_204_NO_CONTENT: + message = { + 'action': 'cancel', + 'data': { + 'id': pk + } + } + crawler_manager.message_sender.send(TASK_TOPIC, message) + return response + + def __str2date(self, s: str) -> datetime: + date = 
None + + try: + date = datetime.strptime(s, '%d-%m-%Y') + + except Exception as e: + print(e) + + return date + + @action(detail=False) + def filter(self, request): + query_params = self.request.query_params.dict() + + end_date = None + start_date = None + + if 'end_date' in query_params: + end_date = self.__str2date(query_params['end_date']) + + start_date = None + if 'start_date' in query_params: + start_date = self.__str2date(query_params['start_date']) + if end_date is None or start_date is None: + msg = {'message': 'You must send the params start_date and end_date, both in the format day-month-year' + + ' in the query params of the url. Eg.: ?start_date=23-04-2023&end_date=01-01-2020, etc.'} + + return Response(msg, status=status.HTTP_400_BAD_REQUEST) + + queryset = self.get_queryset() + serializer = self.get_serializer(queryset, many=True) + + # serializer.data is ordered_dict + tasks = json.loads(json.dumps(serializer.data)) + data = task_filter_by_date_interval(tasks, start_date, end_date) + + return Response(data, status=status.HTTP_200_OK) \ No newline at end of file diff --git a/crawler_manager/settings.py b/crawler_manager/settings.py index 6a07b7a3..d25071ee 100644 --- a/crawler_manager/settings.py +++ b/crawler_manager/settings.py @@ -3,7 +3,7 @@ # Kafka host information KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' KAFKA_CONSUMER_TIMEOUT = 120000 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000 @@ -16,7 +16,7 @@ KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000 # Redis host information -REDIS_HOST = os.getenv('REDIS_HOST', 'redis') +REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') REDIS_PORT = int(os.getenv('REDIS_PORT', 6379)) REDIS_DB = int(os.getenv('REDIS_DB', 0)) REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None) @@ -39,7 +39,7 @@ WRITER_TOPIC = os.getenv('WRITER_TOPIC', KAFKA_TOPIC_PREFIX + '.writer') STOPPED_SPIDER_NOTIFICATION_ADDRESS = os.getenv( - 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://web:8000/detail/stop_crawl/{crawler_id}') + 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://localhost:8000/detail/stop_crawl/{crawler_id}') TASK_TOPIC = os.getenv('TASK_TOPIC', KAFKA_TOPIC_PREFIX + 'task_topic') TASK_DATA_CONSUMER_GROUP = os.getenv('TASK_DATA_CONSUMER_DATA', KAFKA_TOPIC_PREFIX + '.task_data_group') diff --git a/interface/settings.py b/interface/settings.py index c92b72c7..ccc5caab 100644 --- a/interface/settings.py +++ b/interface/settings.py @@ -44,7 +44,7 @@ SECRET_KEY = get_random_secret_key() # SECURITY WARNING: don't run with debug turned on in production! 
-DEBUG = env('DEBUG') +DEBUG = True#env('DEBUG') ALLOWED_HOSTS = env('DJANGO_ALLOWED_HOSTS') @@ -61,6 +61,7 @@ 'crispy_forms', 'rest_framework', 'mathfilters', + 'api.apps.ApiConfig' ] CRISPY_TEMPLATE_PACK = 'bootstrap4' diff --git a/interface/urls.py b/interface/urls.py index 58c33c11..555b7291 100644 --- a/interface/urls.py +++ b/interface/urls.py @@ -21,5 +21,6 @@ urlpatterns = [ path('admin/', admin.site.urls), - path('', include("main.urls")), + path('', include('main.urls')), + path('api/', include('api.urls')), ] + static(settings.STATIC_URL, document_root=settings.STATIC_ROOT) \ No newline at end of file diff --git a/main/custom_schema_generator.py b/main/custom_schema_generator.py new file mode 100644 index 00000000..e69de29b diff --git a/main/staticfiles/json/steps_signature.json b/main/staticfiles/json/steps_signature.json index 13520b6b..47a4391f 100644 --- a/main/staticfiles/json/steps_signature.json +++ b/main/staticfiles/json/steps_signature.json @@ -1 +1 @@ -[{"name": "break_image_captcha", "mandatory_params": ["xpath_input", "xpath_output"], "optional_params": {"preprocessing": null}}, {"name": "clique", "mandatory_params": ["xpath"], "optional_params": {}}, {"name": "cssify", "mandatory_params": ["xpath"], "optional_params": {}}, {"name": "digite", "mandatory_params": ["xpath", "texto"], "optional_params": {}}, {"name": "element_in_page", "mandatory_params": ["xpath"], "optional_params": {}}, {"name": "espere", "mandatory_params": ["segs"], "optional_params": {}}, {"name": "for_clicavel", "mandatory_params": ["xpath"], "optional_params": {}}, {"name": "gera_nome_arquivo", "mandatory_params": [], "optional_params": {}}, {"name": "nesse_elemento_esta_escrito", "mandatory_params": ["xpath", "texto"], "optional_params": {}}, {"name": "opcoes", "mandatory_params": ["xpath"], "optional_params": {"exceto": null}}, {"name": "pegue_os_links_da_paginacao", "mandatory_params": ["xpath_dos_botoes", "xpath_dos_links"], "optional_params": {"indice_do_botao_proximo": -1}}, {"name": "print_", "mandatory_params": ["word"], "optional_params": {}}, {"name": "range_", "mandatory_params": ["stop"], "optional_params": {}}, {"name": "salva_pagina", "mandatory_params": [], "optional_params": {}}, {"name": "selecione", "mandatory_params": ["xpath", "opcao"], "optional_params": {}}, {"name": "wait_page", "mandatory_params": [], "optional_params": {}}] \ No newline at end of file +[{"name": "clique", "name_display": "Clicar", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["elemento"], "optional_params": {}, "field_options": {}}, {"name": "comparacao", "name_display": "Compara\u00e7\u00e3o", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["arg1", "comp", "arg2"], "optional_params": {}, "field_options": {"comp": {"field_type": "select", "select_options": ["'=='", "'<='", "'>='", "'<'", "'>'", "'!='"]}}}, {"name": "digite", "name_display": "Digitar em", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["xpath", "texto"], "optional_params": {}, "field_options": {}}, {"name": "elemento_existe_na_pagina", "name_display": "Checar se elemento existe na p\u00e1gina", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["xpath"], "optional_params": {}, "field_options": {}}, {"name": "espere", "name_display": "Esperar", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["segundos"], "optional_params": {}, "field_options": {"segundos": {"field_type": "number", "input_placeholder": "Espera em segundos"}}}, 
{"name": "extrai_texto", "name_display": "Extrair texto de", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["xpath"], "optional_params": {}, "field_options": {}}, {"name": "for_clicavel", "name_display": "\u00c9 clic\u00e1vel", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["xpath"], "optional_params": {}, "field_options": {}}, {"name": "imprime", "name_display": "Imprimir", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["texto"], "optional_params": {}, "field_options": {}}, {"name": "localiza_elementos", "name_display": "Localizar elementos", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["xpath"], "optional_params": {"numero_xpaths": null, "modo": "Modo Simples"}, "field_options": {"modo": {"field_type": "select", "select_options": ["'Modo Simples'", "'XPath Complexos'"]}}}, {"name": "nesse_elemento_esta_escrito", "name_display": "Est\u00e1 escrito", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["xpath", "texto"], "optional_params": {}, "field_options": {}}, {"name": "objeto", "name_display": "Objeto", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["objeto"], "optional_params": {}, "field_options": {}}, {"name": "opcoes", "name_display": "Op\u00e7\u00f5es", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["xpath"], "optional_params": {"exceto": null}, "field_options": {}}, {"name": "quebrar_captcha_imagem", "name_display": "Quebrar captcha de imagem", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["xpath_do_elemento_captcha", "xpath_do_campo_a_preencher"], "optional_params": {"funcao_preprocessamento": null}, "field_options": {}}, {"name": "repete", "name_display": "Repetir", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["vezes"], "optional_params": {}, "field_options": {}}, {"name": "retorna_pagina", "name_display": "Voltar", "executable_contexts": ["page", "tab"], "mandatory_params": [], "optional_params": {}, "field_options": {}}, {"name": "run_javascript", "name_display": "Executar c\u00f3digo Javascript", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["codigo_javascript"], "optional_params": {}, "field_options": {}}, {"name": "salva_pagina", "name_display": "Salvar p\u00e1gina", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": [], "optional_params": {}, "field_options": {}}, {"name": "selecione", "name_display": "Selecionar", "executable_contexts": ["page", "tab", "iframe"], "mandatory_params": ["xpath", "opcao"], "optional_params": {}, "field_options": {}}] \ No newline at end of file diff --git a/main/urls.py b/main/urls.py index 41e62785..7d272d78 100644 --- a/main/urls.py +++ b/main/urls.py @@ -1,59 +1,36 @@ -from django.urls import include, path -from django.conf.urls import url -from rest_framework import routers +from django.urls import path from . 
import views -# Router for API endpoints -api_router = routers.DefaultRouter() -api_router.register(r'crawlers', views.CrawlerViewSet) -api_router.register(r'instances', views.CrawlerInstanceViewSet) -api_router.register(r'crawler_queue', views.CrawlerQueueViewSet) -api_router.register(r'tasks', views.TaskViewSet) - urlpatterns = [ - path("", views.list_crawlers, name="list_crawlers"), - path("crawlers/", views.list_crawlers, name="list_crawlers"), - path('grouped_crawlers', views.list_grouped_crawlers, name="list_grouped_crawlers"), - path("new/", views.create_crawler, name="create_crawler"), - path("new_group/", views.create_grouped_crawlers, name="create_grouped_crawlers"), - path("edit//", views.edit_crawler, name="edit_crawler"), - path("edit_group//", views.edit_grouped_crawlers, name="edit_grouped_crawlers"), - path("delete//", views.delete_crawler, name="delete_crawler"), - path("detail//", views.detail_crawler, name="detail_crawler"), - path("crawlers/steps/", views.create_steps, name="create_steps"), - path("monitoring/", views.monitoring, name="monitoring"), - path("detail/run_crawl/", views.run_crawl, name="run_crawl"), - path("detail/stop_crawl/", views.stop_crawl, name="stop_crawl"), - path("tail_log_file/", views.tail_log_file, name="tail_log_file"), - path("raw_log_out/", views.raw_log_out, name="raw_log_out"), - path("raw_log_err/", views.raw_log_err, name="raw_log_err"), + path('', views.list_crawlers, name='list_crawlers'), - path("download/files/found//", views.files_found, name="files_found"), - path("download/file/success/", views.success_download_file, name="success_download_file"), - path("download/file/previously_crawled/", views.previously_crawled_file, name="previously_crawled_file"), - path("download/file/error/", views.error_download_file, name="error_download_file"), + # crawlers + path('new/', views.create_crawler, name='create_crawler'), + path('crawlers/', views.list_crawlers, name='list_crawlers'), + path('crawlers/steps/', views.create_steps, name='create_steps'), - path("download/pages/found//", views.pages_found, name="pages_found"), - path("download/page/success/", views.success_download_page, name="success_download_page"), - path("download/page/previously_crawled/", views.previously_crawled_page, name="previously_crawled_page"), - path("download/page/error/", views.error_download_page, name="error_download_page"), - path("download/page/duplicated/", views.duplicated_download_page, name="duplicated_download_page"), + path('detail//', views.detail_crawler, name='detail_crawler'), + path('detail/run_crawl/', views.run_crawl, name='run_crawl'), + path('detail/stop_crawl/', views.stop_crawl, name='stop_crawl'), - path("export_config/", views.export_config, name="export_config"), - path("export_trace/", views.export_trace, name="export_trace"), - - path("info/screenshots//", views.view_screenshots, name="view_screenshots"), - - path("iframe/load", views.load_iframe, name="load_iframe"), - - path('list_process', views.list_process, name="list_process"), - path('get_crawlers_from_same_group/', views.get_crawlers_from_same_group, name="get_crawlers_from_same_group"), - - path("crawler_queue/", views.crawler_queue, name="crawler_queue"), - path("crawler_queue/remove/", views.remove_crawl_request_view, name="remove_crawl_request"), - - path("scheduler/", views.scheduler, name="scheduler"), - - # Includes the API endpoints in the URLs - url(r'^api/', include(api_router.urls)), -] + path('edit//', views.edit_crawler, name='edit_crawler'), + path('edit_group//', 
views.edit_grouped_crawlers, name='edit_grouped_crawlers'), + + path('delete//', views.delete_crawler, name='delete_crawler'), + + # grouped crawlers + path('new_group/', views.create_grouped_crawlers, name='create_grouped_crawlers'), + path('grouped_crawlers', views.list_grouped_crawlers, name='list_grouped_crawlers'), + + # misc + path('monitoring/', views.monitoring, name='monitoring'), + path('iframe/load', views.load_iframe, name='load_iframe'), + path('list_process', views.list_process, name='list_process'), + + # crawler queue + path('crawler_queue/', views.crawler_queue, name='crawler_queue'), + path('crawler_queue/remove/', views.remove_crawl_request_view, name='remove_crawl_request'), + + # scheduler + path('scheduler/', views.scheduler, name='scheduler'), +] \ No newline at end of file diff --git a/main/utils.py b/main/utils.py new file mode 100644 index 00000000..3bdd9f5a --- /dev/null +++ b/main/utils.py @@ -0,0 +1,262 @@ +from typing import Literal +from django.db import transaction +from django.utils import timezone + +import crawler_manager.crawler_manager as crawler_manager +from main.models import CrawlRequest, CrawlerInstance, CrawlerQueue, CrawlerQueueItem +from main.forms import ParameterHandlerFormSet, ResponseHandlerFormSet + +try: + CRAWLER_QUEUE = CrawlerQueue.object() + + # clears all items from the queue when starting the system + CrawlerQueueItem.objects.all().delete() + +except: + pass + +def create_instance(crawler_id, instance_id): + mother = CrawlRequest.objects.filter(id=crawler_id) + obj = CrawlerInstance.objects.create( + crawler=mother[0], instance_id=instance_id, running=True) + return obj + +def add_crawl_request(crawler_id, wait_on: Literal['last_position', 'first_position', 'no_wait'] = 'last_position'): + already_in_queue = CrawlerQueueItem.objects.filter(crawl_request_id=crawler_id).exists() + + if already_in_queue: + return + + crawl_request = CrawlRequest.objects.get(pk=crawler_id) + cr_expec_runtime_cat = crawl_request.expected_runtime_category + + if wait_on == 'no_wait': + queue_item = CrawlerQueueItem(crawl_request_id=crawler_id, + position=CrawlerQueueItem.NO_WAIT_POSITION, + running=True, + forced_execution=True, + queue_type=cr_expec_runtime_cat) + queue_item.save() + + else: + # The new element of the crawler queue must be in the correct position (after the last added) and + # in the correct queue: fast, normal or slow + + position = 0 + + if wait_on == 'first_position': + first_queue_item_created = CrawlerQueueItem.objects.filter( + queue_type=cr_expec_runtime_cat).order_by('position').first() + + if first_queue_item_created: + position = first_queue_item_created.position - 1 + + else: + last_queue_item_created = CrawlerQueueItem.objects.filter( + queue_type=cr_expec_runtime_cat).order_by('position').last() + if last_queue_item_created: + position = last_queue_item_created.position + 1 + + queue_item = CrawlerQueueItem(crawl_request_id=crawler_id, + position=position, + queue_type=cr_expec_runtime_cat) + queue_item.save() + +def unqueue_crawl_requests(queue_type: str): + crawlers_runnings = list() + has_items_from_another_queue, queue_items = CRAWLER_QUEUE.get_next(queue_type) + + for queue_item_id, crawler_id in queue_items: + instance = process_run_crawl(crawler_id) + + crawlers_runnings.append({ + 'crawler_id': crawler_id, + 'instance_id': instance.pk + }) + + queue_item = CrawlerQueueItem.objects.get(pk=queue_item_id) + queue_item.running = True + + # the crawlers from the another queue "will be insert in the queue with vacancy" + if 
has_items_from_another_queue: + queue_item.queue_type = queue_type + + queue_item.save() + + response = {'crawlers_added_to_run': crawlers_runnings} + return response + +def process_stop_crawl(crawler_id, from_sm_listener: bool = False): + instance = CrawlRequest.objects.filter( + id=crawler_id).get().running_instance + # instance = CrawlerInstance.objects.get(instance_id=instance_id) + + # No instance running + if instance is None: + raise ValueError("No instance running") + + if from_sm_listener and not instance.download_files_finished(): + instance.page_crawling_finished = True + instance.save() + + return + + instance_id = instance.instance_id + config = CrawlRequest.objects.filter(id=int(crawler_id)).values()[0] + + # FIXME: Colocar esse trecho de código no módulo writer + # computa o tamanho em kbytes do diretório "data" + # command_output = subprocess.run(["du " + config['data_path'] + "/data -d 0"], shell=True, stdout=subprocess.PIPE) + # output_line = command_output.stdout.decode('utf-8').strip('\n') + # parts = output_line.split('\t') + data_size_kbytes = 0 # int(parts[0]) + + # Get the number of files downloaded from the instance object + num_data_files = instance.number_files_success_download + + + instance = None + instance_info = {} + queue_type = None + + with transaction.atomic(): + # Execute DB commands atomically + instance = CrawlerInstance.objects.get(instance_id=instance_id) + instance.running = False + instance.finished_at = timezone.now() + instance.data_size_kbytes = data_size_kbytes + instance.num_data_files = num_data_files + instance.save() + + queue_item = CrawlerQueueItem.objects.get(crawl_request_id=crawler_id) + queue_type = queue_item.queue_type + queue_item.delete() + + # As soon as the instance is created, it starts to collect and is only modified when it stops, + # we use these fields to define when a collection started and ended + instance_info["started_at"] = str(instance.creation_date) + instance_info["finished_at"] = str(instance.last_modified) + instance_info["data_size_kbytes"] = data_size_kbytes + instance_info["num_data_files"] = num_data_files + + crawler_manager.update_instances_info( + config["data_path"], str(instance_id), instance_info) + + crawler_manager.stop_crawler(crawler_id) + + unqueue_crawl_requests(queue_type) + +def remove_crawl_request(crawler_id): + in_queue = CrawlerQueueItem.objects.filter(crawl_request_id=crawler_id).exists() + + if in_queue: + queue_item = CrawlerQueueItem.objects.get(crawl_request_id=crawler_id) + queue_item.delete() + +def process_run_crawl(crawler_id): + instance = None + instance_info = dict() + + crawler_entry = CrawlRequest.objects.filter(id=crawler_id) + data = crawler_entry.values()[0] + + # Instance already running + if crawler_entry.get().running: + instance_id = crawler_entry.get().running_instance.instance_id + raise ValueError("An instance is already running for this crawler " + f"({instance_id})") + + data = CrawlRequest.process_config_data(crawler_entry.get(), data) + + data["instance_id"] = crawler_manager.gen_key() + instance = create_instance(data['id'], data["instance_id"]) + crawler_manager.start_crawler(data.copy()) + + instance_info["started_at"] = str(instance.creation_date) + instance_info["finished_at"] = None + + crawler_manager.update_instances_info( + data["data_path"], str(data["instance_id"]), instance_info) + + return instance + +def get_all_crawl_requests(): + return CrawlRequest.objects.all().order_by('-last_modified') + +def get_all_crawl_requests_filtered(filter_crawler_id, 
filter_name, filter_base_url, filter_dynamic, filter_start_date, filter_end_date, filter_status): + filters_url = '' + all_crawlers = get_all_crawl_requests() + + if filter_crawler_id != '': + all_crawlers = all_crawlers.filter(id=filter_crawler_id) + filters_url += '&filter_crawler_id=' + filter_crawler_id + + if filter_name != '': + all_crawlers = all_crawlers.filter(source_name__icontains=filter_name) + filters_url += '&filter_name=' + filter_name + + if filter_base_url != '': + all_crawlers = all_crawlers.filter(base_url__exact=filter_base_url) + filters_url += '&filter_base_url=' + filter_base_url + + if filter_dynamic != '': + if filter_dynamic == '0': + all_crawlers = all_crawlers.filter(Q(dynamic_processing=0) | Q(dynamic_processing__isnull=True)) + if filter_dynamic == '1': + all_crawlers = all_crawlers.filter(dynamic_processing=1) + filters_url += '&filter_dynamic=' + filter_dynamic + + if filter_start_date != '': + all_crawlers = all_crawlers.filter(creation_date__gte=filter_start_date) + filters_url += '&filter_start_date=' + filter_start_date + + if filter_end_date != '': + all_crawlers = all_crawlers.filter(creation_date__lte=filter_end_date) + filters_url += '&filter_end_date=' + filter_end_date + + if filter_status != '': + if filter_status == 'running': + all_crawlers = all_crawlers.filter(instances__running=True).distinct() + + if filter_status == 'stopped': + all_crawlers = all_crawlers.filter(instances__running=False).distinct() + + if filter_status == 'queue_fast': + all_crawlers = all_crawlers.filter(crawlerqueueitem__isnull=False).select_related('crawlerqueueitem').filter( + crawlerqueueitem__running=False).filter(crawlerqueueitem__queue_type__exact='fast') + + if filter_status == 'queue_medium': + all_crawlers = all_crawlers.filter(crawlerqueueitem__isnull=False).select_related('crawlerqueueitem').filter( + crawlerqueueitem__running=False).filter(crawlerqueueitem__queue_type__exact='medium') + + if filter_status == 'queue_medium': + all_crawlers = all_crawlers.filter(crawlerqueueitem__isnull=False).select_related('crawlerqueueitem').filter( + crawlerqueueitem__running=False).filter(crawlerqueueitem__queue_type__exact='slow') + + filters_url += '&filter_status=' + filter_status + + return (all_crawlers, filters_url) + +def generate_injector_forms(*args, filter_queryset=False, **kwargs): + queryset = None + crawler = None + + if filter_queryset: + crawler = kwargs.get('instance') + + if crawler is None: + raise ValueError("If the filter_queryset option is True, the " + + "instance property must be set.") + + queryset = crawler.parameter_handlers + + parameter_formset = ParameterHandlerFormSet(*args, + prefix='templated_url-params', queryset=queryset, **kwargs) + + if filter_queryset: + queryset = crawler.response_handlers + + response_formset = ResponseHandlerFormSet(*args, + prefix='templated_url-responses', queryset=queryset, **kwargs) + + return parameter_formset, response_formset diff --git a/main/views.py b/main/views.py index 7f89de20..d1233a3a 100644 --- a/main/views.py +++ b/main/views.py @@ -1,317 +1,37 @@ -import base64 -import json import logging import multiprocessing as mp -import os -import time -import subprocess -from datetime import datetime -from typing_extensions import Literal -import crawler_manager.crawler_manager as crawler_manager -from crawler_manager.settings import (TASK_TOPIC, OUTPUT_FOLDER) -from crawler_manager.constants import * - -from django.conf import settings -from django.core.exceptions import ObjectDoesNotExist from 
django.core.paginator import Paginator -from django.db import transaction from django.db.models import Q -from django.http import (FileResponse, HttpResponse, HttpResponseNotFound, - HttpResponseRedirect, JsonResponse) +from django.http import HttpResponse, HttpResponseRedirect from django.shortcuts import get_object_or_404, redirect, render -from django.utils import timezone -from rest_framework import status, viewsets -from rest_framework.decorators import action -from rest_framework.response import Response -from .iframe_loader import iframe_loader -from .forms import (CrawlRequestForm, ParameterHandlerFormSet, - RawCrawlRequestForm, ResponseHandlerFormSet) -from .models import (CRAWLER_QUEUE_DB_ID, CrawlerInstance, CrawlerQueue, - CrawlerQueueItem, CrawlRequest, Log, Task) -from .serializers import (CrawlerInstanceSerializer, CrawlerQueueSerializer, - CrawlRequestSerializer, TaskSerializer) -from .task_filter import task_filter_by_date_interval +import crawler_manager.crawler_manager as crawler_manager +from crawler_manager.constants import * +from main.utils import (add_crawl_request, generate_injector_forms, + get_all_crawl_requests_filtered, process_stop_crawl, + remove_crawl_request, unqueue_crawl_requests) +from .forms import CrawlRequestForm, RawCrawlRequestForm +from .iframe_loader import iframe_loader +from .models import CrawlerQueueItem, CrawlRequest # Log the information to the file logger logger = logging.getLogger('file') -# Helper methods - -try: - CRAWLER_QUEUE = CrawlerQueue.object() - - # clears all items from the queue when starting the system - CrawlerQueueItem.objects.all().delete() - -except: - pass - - -def process_run_crawl(crawler_id): - instance = None - instance_info = dict() - - crawler_entry = CrawlRequest.objects.filter(id=crawler_id) - data = crawler_entry.values()[0] - - # Instance already running - if crawler_entry.get().running: - instance_id = crawler_entry.get().running_instance.instance_id - raise ValueError("An instance is already running for this crawler " - f"({instance_id})") - - data = CrawlRequest.process_config_data(crawler_entry.get(), data) - - data["instance_id"] = crawler_manager.gen_key() - instance = create_instance(data['id'], data["instance_id"]) - crawler_manager.start_crawler(data.copy()) - - instance_info["started_at"] = str(instance.creation_date) - instance_info["finished_at"] = None - - crawler_manager.update_instances_info( - data["data_path"], str(data["instance_id"]), instance_info) - - return instance - - -def add_crawl_request(crawler_id, wait_on: Literal['last_position', 'first_position', 'no_wait'] = 'last_position'): - already_in_queue = CrawlerQueueItem.objects.filter(crawl_request_id=crawler_id).exists() - - if already_in_queue: - return - - crawl_request = CrawlRequest.objects.get(pk=crawler_id) - cr_expec_runtime_cat = crawl_request.expected_runtime_category - - if wait_on == 'no_wait': - queue_item = CrawlerQueueItem(crawl_request_id=crawler_id, - position=CrawlerQueueItem.NO_WAIT_POSITION, - running=True, - forced_execution=True, - queue_type=cr_expec_runtime_cat) - queue_item.save() - - else: - # The new element of the crawler queue must be in the correct position (after the last added) and - # in the correct queue: fast, normal or slow - - position = 0 - - if wait_on == 'first_position': - first_queue_item_created = CrawlerQueueItem.objects.filter( - queue_type=cr_expec_runtime_cat).order_by('position').first() - - if first_queue_item_created: - position = first_queue_item_created.position - 1 - - else: - 
last_queue_item_created = CrawlerQueueItem.objects.filter( - queue_type=cr_expec_runtime_cat).order_by('position').last() - if last_queue_item_created: - position = last_queue_item_created.position + 1 - - queue_item = CrawlerQueueItem(crawl_request_id=crawler_id, - position=position, - queue_type=cr_expec_runtime_cat) - queue_item.save() - - -def remove_crawl_request(crawler_id): - in_queue = CrawlerQueueItem.objects.filter(crawl_request_id=crawler_id).exists() - - if in_queue: - queue_item = CrawlerQueueItem.objects.get(crawl_request_id=crawler_id) - queue_item.delete() - - def remove_crawl_request_view(request, crawler_id): remove_crawl_request(crawler_id) return redirect('/detail/' + str(crawler_id)) - -def unqueue_crawl_requests(queue_type: str): - crawlers_runnings = list() - has_items_from_another_queue, queue_items = CRAWLER_QUEUE.get_next(queue_type) - - for queue_item_id, crawler_id in queue_items: - instance = process_run_crawl(crawler_id) - - crawlers_runnings.append({ - 'crawler_id': crawler_id, - 'instance_id': instance.pk - }) - - queue_item = CrawlerQueueItem.objects.get(pk=queue_item_id) - queue_item.running = True - - # the crawlers from the another queue "will be insert in the queue with vacancy" - if has_items_from_another_queue: - queue_item.queue_type = queue_type - - queue_item.save() - - response = {'crawlers_added_to_run': crawlers_runnings} - return response - - -def process_stop_crawl(crawler_id, from_sm_listener: bool = False): - instance = CrawlRequest.objects.filter( - id=crawler_id).get().running_instance - # instance = CrawlerInstance.objects.get(instance_id=instance_id) - - # No instance running - if instance is None: - raise ValueError("No instance running") - - if from_sm_listener and not instance.download_files_finished(): - instance.page_crawling_finished = True - instance.save() - - return - - instance_id = instance.instance_id - config = CrawlRequest.objects.filter(id=int(crawler_id)).values()[0] - - # FIXME: Colocar esse trecho de código no módulo writer - # computa o tamanho em kbytes do diretório "data" - # command_output = subprocess.run(["du " + config['data_path'] + "/data -d 0"], shell=True, stdout=subprocess.PIPE) - # output_line = command_output.stdout.decode('utf-8').strip('\n') - # parts = output_line.split('\t') - data_size_kbytes = 0 # int(parts[0]) - - # Get the number of files downloaded from the instance object - num_data_files = instance.number_files_success_download - - - instance = None - instance_info = {} - queue_type = None - - with transaction.atomic(): - # Execute DB commands atomically - instance = CrawlerInstance.objects.get(instance_id=instance_id) - instance.running = False - instance.finished_at = timezone.now() - instance.data_size_kbytes = data_size_kbytes - instance.num_data_files = num_data_files - instance.save() - - queue_item = CrawlerQueueItem.objects.get(crawl_request_id=crawler_id) - queue_type = queue_item.queue_type - queue_item.delete() - - # As soon as the instance is created, it starts to collect and is only modified when it stops, - # we use these fields to define when a collection started and ended - instance_info["started_at"] = str(instance.creation_date) - instance_info["finished_at"] = str(instance.last_modified) - instance_info["data_size_kbytes"] = data_size_kbytes - instance_info["num_data_files"] = num_data_files - - crawler_manager.update_instances_info( - config["data_path"], str(instance_id), instance_info) - - crawler_manager.stop_crawler(crawler_id) - - unqueue_crawl_requests(queue_type) - - 
def list_process(request): text = '' for p in mp.active_children(): text += f"child {p.name} is PID {p.pid}
    " - return HttpResponse(text) - def crawler_queue(request): return render(request, 'main/crawler_queue.html') - -def getAllData(): - return CrawlRequest.objects.all().order_by('-last_modified') - - -def getAllDataFiltered(filter_crawler_id, filter_name, filter_base_url, filter_dynamic, filter_start_date, filter_end_date, filter_status): - filters_url = '' - all_crawlers = getAllData() - - if filter_crawler_id != '': - all_crawlers = all_crawlers.filter(id=filter_crawler_id) - filters_url += '&filter_crawler_id=' + filter_crawler_id - if filter_name != '': - all_crawlers = all_crawlers.filter(source_name__icontains=filter_name) - filters_url += '&filter_name=' + filter_name - if filter_base_url != '': - all_crawlers = all_crawlers.filter(base_url__exact=filter_base_url) - filters_url += '&filter_base_url=' + filter_base_url - if filter_dynamic != '': - if filter_dynamic == '0': - all_crawlers = all_crawlers.filter(Q(dynamic_processing=0) | Q(dynamic_processing__isnull=True)) - if filter_dynamic == '1': - all_crawlers = all_crawlers.filter(dynamic_processing=1) - filters_url += '&filter_dynamic=' + filter_dynamic - if filter_start_date != '': - all_crawlers = all_crawlers.filter(creation_date__gte=filter_start_date) - filters_url += '&filter_start_date=' + filter_start_date - if filter_end_date != '': - all_crawlers = all_crawlers.filter(creation_date__lte=filter_end_date) - filters_url += '&filter_end_date=' + filter_end_date - if filter_status != '': - if filter_status == 'running': - all_crawlers = all_crawlers.filter(instances__running=True).distinct() - if filter_status == 'stopped': - all_crawlers = all_crawlers.filter(instances__running=False).distinct() - if filter_status == 'queue_fast': - all_crawlers = all_crawlers.filter(crawlerqueueitem__isnull=False).select_related('crawlerqueueitem').filter( - crawlerqueueitem__running=False).filter(crawlerqueueitem__queue_type__exact='fast') - if filter_status == 'queue_medium': - all_crawlers = all_crawlers.filter(crawlerqueueitem__isnull=False).select_related('crawlerqueueitem').filter( - crawlerqueueitem__running=False).filter(crawlerqueueitem__queue_type__exact='medium') - if filter_status == 'queue_medium': - all_crawlers = all_crawlers.filter(crawlerqueueitem__isnull=False).select_related('crawlerqueueitem').filter( - crawlerqueueitem__running=False).filter(crawlerqueueitem__queue_type__exact='slow') - filters_url += '&filter_status=' + filter_status - - return (all_crawlers, filters_url) - - -def create_instance(crawler_id, instance_id): - mother = CrawlRequest.objects.filter(id=crawler_id) - obj = CrawlerInstance.objects.create( - crawler=mother[0], instance_id=instance_id, running=True) - return obj - - -def generate_injector_forms(*args, filter_queryset=False, **kwargs): - queryset = None - crawler = None - if filter_queryset: - crawler = kwargs.get('instance') - - if crawler is None: - raise ValueError("If the filter_queryset option is True, the " + - "instance property must be set.") - - queryset = crawler.parameter_handlers - - parameter_formset = ParameterHandlerFormSet(*args, - prefix='templated_url-params', queryset=queryset, **kwargs) - - if filter_queryset: - queryset = crawler.response_handlers - response_formset = ResponseHandlerFormSet(*args, - prefix='templated_url-responses', queryset=queryset, **kwargs) - - return parameter_formset, response_formset - - -# Views - - def list_crawlers(request): page_number = request.GET.get('page', 1) filter_crawler_id = request.GET.get('filter_crawler_id', '') @@ -323,7 +43,7 @@ 
def list_crawlers(request): filter_status = request.GET.get('filter_status', '') - all_crawlers, filters_url = getAllDataFiltered(filter_crawler_id, + all_crawlers, filters_url = get_all_crawl_requests_filtered(filter_crawler_id, filter_name, filter_base_url, filter_dynamic, @@ -345,8 +65,8 @@ def list_crawlers(request): 'filter_status': filter_status, 'filters_url': filters_url, } - return render(request, "main/list_crawlers.html", context) + return render(request, "main/list_crawlers.html", context) def get_grouped_crawlers_filtered(filter_crawler_id, filter_dynamic, filter_start_date, filter_end_date): filters_url = '' @@ -355,15 +75,20 @@ def get_grouped_crawlers_filtered(filter_crawler_id, filter_dynamic, filter_star if filter_crawler_id != '': filters_url += '&filter_crawler_id=' + filter_crawler_id filter_string += f" and X.id = {filter_crawler_id}" + if filter_dynamic != '': filters_url += '&filter_dynamic=' + filter_dynamic + if int(filter_dynamic) == 1: filter_string += " and dynamic_processing = TRUE" + else: filter_string += " and (dynamic_processing = FALSE or dynamic_processing IS NULL)" + if filter_start_date != '': filters_url += '&filter_start_date=' + filter_start_date filter_string += f" and date(creation_date) >= '{filter_start_date}'" + if filter_end_date != '': filters_url += '&filter_end_date=' + filter_end_date filter_string += f" and date(creation_date) <= '{filter_end_date}'" @@ -406,27 +131,6 @@ def list_grouped_crawlers(request): return render(request, "main/list_grouped_crawlers.html", context) -def get_crawlers_from_same_group(request, crawler_id): - crawlers = CrawlRequest.objects.raw( - "select id, source_name \ - from main_crawlrequest \ - where steps=( \ - select steps from main_crawlrequest where id = "+str(crawler_id)+") order by id desc") - - json_data = [] - for item in crawlers: - json_data.append({ - 'id': item.id, - 'source_name': item.source_name, - 'last_modified': item.last_modified, - 'base_url': item.base_url, - }) - - json_data = json.dumps(json_data, default=str) - - return JsonResponse(json_data, safe=False) - - def create_crawler(request): context = {} @@ -452,8 +156,8 @@ def create_crawler(request): context['form'] = my_form context['templated_response_formset'] = templated_response_formset context['templated_parameter_formset'] = templated_parameter_formset - return render(request, "main/create_crawler.html", context) + return render(request, "main/create_crawler.html", context) def create_grouped_crawlers(request): context = {} @@ -500,8 +204,8 @@ def create_grouped_crawlers(request): context['templated_parameter_formset'] = templated_parameter_formset context['crawler_types'] = CrawlRequest.CRAWLERS_TYPES context['page_context'] = 'new_group' - return render(request, "main/create_grouped_crawlers.html", context) + return render(request, "main/create_grouped_crawlers.html", context) def edit_crawler(request, crawler_id): crawler = get_object_or_404(CrawlRequest, pk=crawler_id) @@ -517,16 +221,15 @@ def edit_crawler(request, crawler_id): form.save() templated_parameter_formset.save() templated_response_formset.save() - return redirect(detail_crawler, crawler_id=crawler_id) - else: - return render(request, 'main/create_crawler.html', { - 'form': form, - 'templated_response_formset': templated_response_formset, - 'templated_parameter_formset': templated_parameter_formset, - 'crawler': crawler - }) + return redirect(detail_crawler, crawler_id=crawler_id) + return render(request, 'main/create_crawler.html', { + 'form': form, + 
'templated_response_formset': templated_response_formset, + 'templated_parameter_formset': templated_parameter_formset, + 'crawler': crawler + }) def edit_grouped_crawlers(request, id): # busca pelo crawler que representa o grupo (pra ajudar a preencher o form) @@ -539,7 +242,6 @@ def edit_grouped_crawlers(request, id): where steps=( \ select steps from main_crawlrequest where id = "+str(id)+") order by id desc") - form = RawCrawlRequestForm(request.POST or None, instance=crawler) templated_parameter_formset, templated_response_formset = \ generate_injector_forms(request.POST or None, filter_queryset=True, @@ -589,7 +291,6 @@ def edit_grouped_crawlers(request, id): return render(request, 'main/create_grouped_crawlers.html', context) - def delete_crawler(request, crawler_id): crawler = CrawlRequest.objects.get(id=crawler_id) @@ -603,7 +304,6 @@ def delete_crawler(request, crawler_id): {'crawler': crawler} ) - def detail_crawler(request, crawler_id): crawler = CrawlRequest.objects.get(id=crawler_id) # order_by("-atribute") orders descending @@ -623,21 +323,17 @@ def detail_crawler(request, crawler_id): return render(request, 'main/detail_crawler.html', context) - def monitoring(request): return HttpResponseRedirect("http://localhost:5000/") - def create_steps(request): return render(request, "main/steps_creation.html", {}) - def stop_crawl(request, crawler_id): from_sm_listener = request.GET.get('from', '') == 'sm_listener' process_stop_crawl(crawler_id, from_sm_listener) return redirect(detail_crawler, crawler_id=crawler_id) - def run_crawl(request, crawler_id): add_crawl_request(crawler_id) @@ -648,307 +344,9 @@ def run_crawl(request, crawler_id): return redirect(detail_crawler, crawler_id=crawler_id) - -def tail_log_file(request, instance_id): - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - files_found = instance.number_files_found - download_file_success = instance.number_files_success_download - download_file_error = instance.number_files_error_download - number_files_previously_crawled = instance.number_files_previously_crawled - - pages_found = instance.number_pages_found - download_page_success = instance.number_pages_success_download - download_page_error = instance.number_pages_error_download - number_pages_duplicated_download = instance.number_pages_duplicated_download - number_pages_previously_crawled = instance.number_pages_previously_crawled - - config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0] - data_path = os.path.join(OUTPUT_FOLDER, config["data_path"]) - - out = subprocess.run(["tail", - f"{data_path}/{instance_id}/log/{instance_id}.out", - "-n", - "20"], - stdout=subprocess.PIPE).stdout - err = subprocess.run(["tail", - f"{data_path}/{instance_id}/log/{instance_id}.err", - "-n", - "20"], - stdout=subprocess.PIPE).stdout - - return JsonResponse({ - "files_found": files_found, - "files_success": download_file_success, - "files_error": download_file_error, - "files_previously_crawled": number_files_previously_crawled, - - "pages_found": pages_found, - "pages_success": download_page_success, - "pages_error": download_page_error, - "pages_duplicated": number_pages_duplicated_download, - "pages_previously_crawled": number_pages_previously_crawled, - - "out": out.decode('utf-8'), - "err": err.decode('utf-8'), - "time": str(datetime.fromtimestamp(time.time())), - }) - - -def raw_log_out(request, instance_id): - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - config = 
CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0] - data_path = os.path.join(OUTPUT_FOLDER, config["data_path"]) - - out = subprocess.run(["tail", - f"{data_path}/{instance_id}/log/{instance_id}.out", - "-n", - "100"], - stdout=subprocess.PIPE).stdout - - raw_text = out.decode('utf-8') - raw_results = raw_text.splitlines(True) - resp = JsonResponse({str(instance_id): raw_results}, - json_dumps_params={'indent': 2}) - - if len(raw_results) > 0 and instance.running: - resp['Refresh'] = 5 - return resp - -def raw_log_err(request, instance_id): - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0] - data_path = os.path.join(OUTPUT_FOLDER, config["data_path"]) - - err = subprocess.run(["tail", - f"{data_path}/{instance_id}/log/{instance_id}.err", - "-n", - "100"], - stdout=subprocess.PIPE).stdout - - raw_text = err.decode('utf-8') - raw_results = raw_text.splitlines(True) - resp = JsonResponse({str(instance_id): raw_results}, - json_dumps_params={'indent': 2}) - - if len(raw_results) > 0 and instance.running: - resp['Refresh'] = 5 - return resp - - -def files_found(request, instance_id, num_files): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - instance.number_files_found += num_files - instance.save() - - return JsonResponse({}, status=status.HTTP_200_OK) - - except Exception as e: - return JsonResponse({}, status=status.HTTP_400_BAD_REQUEST) - - -def success_download_file(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - instance.number_files_success_download += 1 - instance.save() - - if instance.page_crawling_finished and instance.download_files_finished(): - process_stop_crawl(instance.crawler.id) - - return JsonResponse({}, status=status.HTTP_200_OK) - - except: - return JsonResponse({}, status=status.HTTP_400_BAD_REQUEST) - -def previously_crawled_file(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - instance.number_files_previously_crawled += 1 - instance.save() - - if instance.page_crawling_finished and instance.download_files_finished(): - process_stop_crawl(instance.crawler.id) - - return JsonResponse({}, status=status.HTTP_200_OK) - - except: - return JsonResponse({}, status=status.HTTP_400_BAD_REQUEST) - - -def error_download_file(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - instance.number_files_error_download += 1 - instance.save() - - if instance.page_crawling_finished and instance.download_files_finished(): - process_stop_crawl(instance.crawler.id) - - return JsonResponse({}, status=status.HTTP_200_OK) - - except: - return JsonResponse({}, status=status.HTTP_400_BAD_REQUEST) - - -def pages_found(request, instance_id, num_pages): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - instance.number_pages_found += num_pages - instance.save() - - return JsonResponse({}, status=status.HTTP_200_OK) - - except Exception as e: - return JsonResponse({}, status=status.HTTP_400_BAD_REQUEST) - - -def success_download_page(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - instance.number_pages_success_download += 1 - instance.save() - - return JsonResponse({}, status=status.HTTP_200_OK) - - except: - return JsonResponse({}, status=status.HTTP_400_BAD_REQUEST) - -def previously_crawled_page(request, instance_id): - try: - 
instance = CrawlerInstance.objects.get(instance_id=instance_id) - - instance.number_pages_previously_crawled += 1 - instance.save() - - return JsonResponse({}, status=status.HTTP_200_OK) - - except: - return JsonResponse({}, status=status.HTTP_400_BAD_REQUEST) - -def error_download_page(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - instance.number_pages_error_download += 1 - instance.save() - - return JsonResponse({}, status=status.HTTP_200_OK) - - except: - return JsonResponse({}, status=status.HTTP_400_BAD_REQUEST) - - -def duplicated_download_page(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - instance.number_pages_duplicated_download += 1 - instance.save() - - return JsonResponse({}, status=status.HTTP_200_OK) - - except: - return JsonResponse({}, status=status.HTTP_400_BAD_REQUEST) - - def downloads(request): return render(request, "main/downloads.html") - -def export_config(request, instance_id): - instance = get_object_or_404(CrawlerInstance, pk=instance_id) - data_path = instance.crawler.data_path - - file_name = f"{instance_id}.json" - rel_path = os.path.join(data_path, str(instance_id), "config", file_name) - path = os.path.join(settings.OUTPUT_FOLDER, rel_path) - - try: - response = FileResponse(open(path, 'rb'), content_type='application/json') - except FileNotFoundError: - print(f"Arquivo de Configuração Não Encontrado: {file_name}") - return HttpResponseNotFound("

    Página Não Encontrada

    ") - else: - response['Content-Length'] = os.path.getsize(path) - response['Content-Disposition'] = "attachment; filename=%s" % file_name - - return response - -def export_trace(request, instance_id): - instance = get_object_or_404(CrawlerInstance, pk=instance_id) - data_path = instance.crawler.data_path - - file_name = f"{instance_id}.zip" - rel_path = os.path.join(data_path, str(instance_id), "debug", "trace", file_name) - path = os.path.join(settings.OUTPUT_FOLDER, rel_path) - - try: - response = FileResponse(open(path, 'rb'), content_type='zip') - except FileNotFoundError: - print(f"Arquivo Trace Não Encontrado: {file_name}. Verifique se a opção de gerar arquivo trace foi habilitada na configuração do coletor.") - return HttpResponseNotFound("

    Página Não Encontrada

    Verifique se a opção de gerar arquivo trace foi habilitada na configuração do coletor.

    ") - else: - response['Content-Length'] = os.path.getsize(path) - response['Content-Disposition'] = "attachment; filename=%s" % file_name - - return response - -def view_screenshots(request, instance_id, page): - IMGS_PER_PAGE = 20 - - instance = get_object_or_404(CrawlerInstance, pk=instance_id) - - output_folder = os.getenv('OUTPUT_FOLDER', '/data') - data_path = instance.crawler.data_path - instance_path = os.path.join(output_folder, data_path, str(instance_id)) - - screenshot_dir = os.path.join(instance_path, "data", "screenshots") - - if not os.path.isdir(screenshot_dir): - return JsonResponse({ - 'error': 'Pasta de coleta não encontrada.', - 'total_screenshots': 0 - }, status=200) - - screenshot_list = sorted(os.listdir(screenshot_dir)) - total_screenshots = len(screenshot_list) - - if total_screenshots == 0: - return JsonResponse({ - 'error': 'Nenhum screenshot encontrado.', - 'total_screenshots': 0 - }, status=200) - - screenshot_list = screenshot_list[(page - 1) * IMGS_PER_PAGE: - page * IMGS_PER_PAGE] - - image_data = [] - for index, screenshot in enumerate(screenshot_list): - img_path = os.path.join(screenshot_dir, screenshot) - with open(img_path, "rb") as image: - curr_img = { - 'base64': base64.b64encode(image.read()).decode('ascii'), - 'title': str(1 + index + ((page - 1) * IMGS_PER_PAGE)) - } - image_data.append(curr_img) - - - return JsonResponse({ - 'data': image_data, - 'total_screenshots': total_screenshots - }, status=200) - - def load_iframe(request): url = request.GET['url'].replace('"', '') xpath = request.GET['xpath'].replace('"', '') @@ -965,352 +363,6 @@ def load_iframe(request): } return render(request, 'main/error_iframe_loader.html', ctx) - def scheduler(request): crawl_requests = CrawlRequest.objects.all() - context = { - 'crawl_requests': crawl_requests - } - return render(request, 'main/scheduler.html', context) - -# API -######## - - -""" -API endpoints: -METHOD ENDPOINT DESCRIPTION -GET /api/ API description -GET /api/crawlers crawler list -POST /api/crawlers create crawler -GET /api/crawlers/ crawler detail -PUT /api/crawlers/ update crawler data -PATCH /api/crawlers/ partially update crawler data -DELETE /api/crawlers/ delete crawler -GET /api/crawlers//run run crawler instance -GET /api/crawlers//stop stop crawler instance -GET /api/instances/ list crawler instances -GET /api/instances/ crawler instance detail -GET /api/downloads/ return details about download itens -GET /api/downloads/ return list of download itens -POST /api/downloads/ create a download item -PUT /api/downloads/ update download item -GET /api/downloads/queue return list of items in download queue -GET /api/downloads/progress return info about current download -GET /api/downloads/error return info about download errors -""" - - -class CrawlerViewSet(viewsets.ModelViewSet): - """ - ViewSet that allows crawlers to be viewed, edited, updated and removed. 
- """ - queryset = CrawlRequest.objects.all().order_by('-creation_date') - serializer_class = CrawlRequestSerializer - - def _create_templated_url_parameter_handlers(self, parameter_handlers, crawler_id): - for handler in parameter_handlers: - handler['crawler_id'] = crawler_id - handler['injection_type'] = 'templated_url' - ParameterHandler.objects.create(**handler) - - def _create_templated_url_response_handlers(self, response_handlers, crawler_id): - for handler in response_handlers: - handler['crawler_id'] = crawler_id - handler['injection_type'] = 'templated_url' - ResponseHandler.objects.create(**handler) - - def create(self, request, *args, **kwargs): - """ - Create a new crawler. - """ - data = request.data - - if type(data) is not dict: - data = data.dict() - - templated_url_parameter_handlers = data.pop('templated_url_parameter_handlers', []) - templated_url_response_handlers = data.pop('templated_url_response_handlers', []) - - serializer = CrawlRequestSerializer(data=request.data) - if serializer.is_valid(): - with transaction.atomic(): - serializer.save() - - crawler_id = serializer.data['id'] - - self._create_templated_url_parameter_handlers(templated_url_parameter_handlers, crawler_id) - self._create_templated_url_response_handlers(templated_url_response_handlers, crawler_id) - - return Response(serializer.data, status=status.HTTP_201_CREATED) - return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST) - - @action(detail=True, methods=['get']) - def run(self, request, pk): - query_params = self.request.query_params.dict() - action = query_params.get('action', '') - - if action == 'run_immediately': - wait_on = 'no_wait' - - add_crawl_request(pk, wait_on) - instance = process_run_crawl(pk) - data = { - 'status': settings.API_SUCCESS, - 'instance': CrawlerInstanceSerializer(instance).data - } - - return JsonResponse(data) - - elif action == 'wait_on_first_queue_position': - wait_on = 'first_position' - - else: - wait_on = 'last_position' - - try: - add_crawl_request(pk, wait_on) - - crawl_request = CrawlRequest.objects.get(pk=pk) - queue_type = crawl_request.expected_runtime_category - - unqueue_crawl_requests(queue_type) - - except Exception as e: - data = { - 'status': settings.API_ERROR, - 'message': str(e) - } - return JsonResponse(data) - - if wait_on == 'first_position': - message = f'Crawler added to crawler queue in first position' - - else: - message = f'Crawler added to crawler queue in last position' - - data = { - 'status': settings.API_SUCCESS, - 'message': message - } - - return JsonResponse(data) - - @action(detail=True, methods=['get']) - def stop(self, request, pk): - instance = None - try: - instance = process_stop_crawl(pk) - - except Exception as e: - data = { - 'status': settings.API_ERROR, - 'message': str(e) - } - return JsonResponse(data,) - - data = { - 'status': settings.API_SUCCESS, - 'instance': CrawlerInstanceSerializer(instance).data - } - return JsonResponse(data) - -class CrawlerInstanceViewSet(viewsets.ReadOnlyModelViewSet): - """ - A simple ViewSet for viewing and listing instances - """ - queryset = CrawlerInstance.objects.all() - serializer_class = CrawlerInstanceSerializer - - -class CrawlerQueueViewSet(viewsets.ModelViewSet): - queryset = CrawlerQueue.objects.all() - serializer_class = CrawlerQueueSerializer - http_method_names = ['get', 'put'] - - def list(self, request): - return self.retrieve(request) - - def retrieve(self, request, pk=None): - crawler_queue = CrawlerQueue.to_dict() - return Response(crawler_queue) - - 
@action(detail=True, methods=['get']) - def switch_position(self, request, pk): - a = request.GET['a'] - b = request.GET['b'] - - with transaction.atomic(): - try: - queue_item_a = CrawlerQueueItem.objects.get(pk=a) - - except ObjectDoesNotExist: - return JsonResponse({'message': f'Crawler queue item {a} not found!'}, status=status.HTTP_404_NOT_FOUND) - - try: - queue_item_b = CrawlerQueueItem.objects.get(pk=b) - - except ObjectDoesNotExist: - return JsonResponse({'message': f'Crawler queue item {b} not found!'}, status=status.HTTP_404_NOT_FOUND) - - if queue_item_a.queue_type != queue_item_b.queue_type: - return JsonResponse({'message': 'Crawler queue items must be in same queue!'}, status=status.HTTP_400_BAD_REQUEST) - - position_aux = queue_item_a.position - - queue_item_a.position = queue_item_b.position - queue_item_b.position = position_aux - - queue_item_a.save() - queue_item_b.save() - - return JsonResponse({'message': 'success'}, status=status.HTTP_200_OK) - - @action(detail=True, methods=['get']) - def force_execution(self, request, pk): - queue_item_id = request.GET['queue_item_id'] - - with transaction.atomic(): - try: - queue_item = CrawlerQueueItem.objects.get(pk=queue_item_id) - - except ObjectDoesNotExist: - return JsonResponse({'message': f'Crawler queue item {queue_item_id} not found!'}, status=status.HTTP_404_NOT_FOUND) - - crawler_id = queue_item.crawl_request.id - - instance = process_run_crawl(crawler_id) - - queue_item.forced_execution = True - queue_item.running = True - queue_item.save() - - data = { - 'crawler_id': crawler_id, - 'instance_id': instance.pk - } - - return JsonResponse(data, status=status.HTTP_200_OK) - - @action(detail=True, methods=['get']) - def remove_item(self, request, pk): - queue_item_id = request.GET['queue_item_id'] - - try: - queue_item = CrawlerQueueItem.objects.get(pk=queue_item_id) - queue_item.delete() - - except ObjectDoesNotExist: - return JsonResponse({'message': f'Crawler queue item {queue_item_id} not found!'}, status=status.HTTP_404_NOT_FOUND) - - return Response(status=status.HTTP_204_NO_CONTENT) - - def update(self, request, pk=None): - response = super().update(request, pk=CRAWLER_QUEUE_DB_ID) - - # updade crawler queue instance with new configs - global CRAWLER_QUEUE - CRAWLER_QUEUE = CrawlerQueue.object() - - # the size of queue of type fast changed, may is possible run - # more crawlers - if 'max_fast_runtime_crawlers_running' in request.data: - unqueue_crawl_requests('fast') - - # the size of queue of type normal changed, may is possible run - # more crawlers - if 'max_medium_runtime_crawlers_running' in request.data: - unqueue_crawl_requests('medium') - - # the size of queue of type slow changed, may is possible run - # more crawlers - if 'max_slow_runtime_crawlers_running' in request.data: - unqueue_crawl_requests('slow') - - return response - - -class TaskViewSet(viewsets.ModelViewSet): - queryset = Task.objects.all() - serializer_class = TaskSerializer - - def create(self, request): - response = super().create(request) - if response.status_code == status.HTTP_201_CREATED: - message = { - 'action': 'create', - 'data': response.data - } - crawler_manager.message_sender.send(TASK_TOPIC, message) - return response - - def update(self, request, pk=None): - response = super().update(request, pk=pk) - if response.status_code == status.HTTP_200_OK: - message = { - 'action': 'update', - 'data': response.data - } - crawler_manager.message_sender.send(TASK_TOPIC, message) - return response - - def partial_update(self, request, 
pk=None): - response = super().partial_update(request, pk=pk) - if response.status_code == status.HTTP_200_OK: - message = { - 'action': 'update', - 'data': response.data - } - crawler_manager.message_sender.send(TASK_TOPIC, message) - return response - - def destroy(self, request, pk=None): - response = super().destroy(request, pk=pk) - if response.status_code == status.HTTP_204_NO_CONTENT: - message = { - 'action': 'cancel', - 'data': { - 'id': pk - } - } - crawler_manager.message_sender.send(TASK_TOPIC, message) - return response - - def __str2date(self, s: str) -> datetime: - date = None - - try: - date = datetime.strptime(s, '%d-%m-%Y') - - except Exception as e: - print(e) - - return date - - @action(detail=False) - def filter(self, request): - query_params = self.request.query_params.dict() - - end_date = None - start_date = None - - if 'end_date' in query_params: - end_date = self.__str2date(query_params['end_date']) - - start_date = None - if 'start_date' in query_params: - start_date = self.__str2date(query_params['start_date']) - if end_date is None or start_date is None: - msg = {'message': 'You must send the params start_date and end_date, both in the format day-month-year' + - ' in the query params of the url. Eg.: ?start_date=23-04-2023&end_date=01-01-2020, etc.'} - - return Response(msg, status=status.HTTP_400_BAD_REQUEST) - - queryset = self.get_queryset() - serializer = self.get_serializer(queryset, many=True) - - # serializer.data is ordered_dict - tasks = json.loads(json.dumps(serializer.data)) - data = task_filter_by_date_interval(tasks, start_date, end_date) - - return Response(data, status=status.HTTP_200_OK) + return render(request, 'main/scheduler.html', {'crawl_requests': crawl_requests}) \ No newline at end of file From 941b1691af4e56e4f297612f3fc3dab74d641b16 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Mon, 5 Jun 2023 17:23:59 -0300 Subject: [PATCH 70/89] Download file from instance api refactored --- api/urls.py | 4 ++ api/views/crawler_instance.py | 50 ++++++++++++++++++- api/views/status/file_download.py | 80 ------------------------------- 3 files changed, 53 insertions(+), 81 deletions(-) delete mode 100644 api/views/status/file_download.py diff --git a/api/urls.py b/api/urls.py index 8b938e50..653dff98 100644 --- a/api/urls.py +++ b/api/urls.py @@ -31,6 +31,10 @@ path('instance/', views.CrawlerInstanceViewSet.as_view(only_list_action), name='instance'), path('instance/', views.CrawlerInstanceViewSet.as_view(only_retrieve_action), name='instance-detail'), path('instance//export_config', views.CrawlerInstanceViewSet.as_view({'get': 'export_config'}), name='instance-export-config'), + path('instance//file/found/', views.CrawlerInstanceViewSet.as_view({'get': 'files_found'}), name='instance-files-found'), + path('instance//file/success', views.CrawlerInstanceViewSet.as_view({'get': 'success_download_file'}), name='instance-success-download-file'), + path('instance//file/error', views.CrawlerInstanceViewSet.as_view({'get': 'error_download_file'}), name='instance-error-download-file'), + path('instance//file/duplicated', views.CrawlerInstanceViewSet.as_view({'get': 'previously_crawled_file'}), name='instance-duplicated-download-file'), path('task/', views.TaskViewSet.as_view(list_and_create_actions), name='task'), path('task/', views.TaskViewSet.as_view(retrieve_update_and_destroy_actions), name='task-detail'), diff --git a/api/views/crawler_instance.py b/api/views/crawler_instance.py index 0066ce1b..0889d6b1 100644 --- a/api/views/crawler_instance.py 
+++ b/api/views/crawler_instance.py @@ -5,8 +5,10 @@ from rest_framework.response import Response from rest_framework import status from django.http import FileResponse +from typing_extensions import Literal from main.models import CrawlerInstance +from main.utils import process_stop_crawl from main.serializers import CrawlerInstanceSerializer from crawler_manager.settings import OUTPUT_FOLDER @@ -41,4 +43,50 @@ def export_config(self, request, pk): response['Content-Length'] = os.path.getsize(path) response['Content-Disposition'] = 'attachment; filename=%s' % file_name - return response \ No newline at end of file + return response + + def _update_file_info(self, instance_id, operation: Literal['found', 'success', 'error', 'duplicated'], val: int = 1): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + try: + if operation == 'found': + instance.number_files_found += val + + elif operation == 'success': + instance.number_files_success_download += val + + elif operation == 'error': + instance.number_files_error_download += val + + elif operation == 'duplicated': + instance.number_files_previously_crawled += val + + instance.save() + + if instance.page_crawling_finished and instance.download_files_finished(): + process_stop_crawl(instance.crawler.id) + + return Response(status=status.HTTP_200_OK) + + except Exception as e: + return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) + + @action(detail=True, methods=['get']) + def files_found(self, request, pk, num_files): + return self._update_file_info(pk, 'found', num_files) + + @action(detail=True, methods=['get']) + def success_download_file(self, request, pk): + return self._update_file_info(pk, 'success') + + @action(detail=True, methods=['get']) + def previously_crawled_file(self, request, pk): + return self._update_file_info(pk, 'duplicated') + + @action(detail=True, methods=['get']) + def error_download_file(self, request, pk): + return self._update_file_info(pk, 'error') \ No newline at end of file diff --git a/api/views/status/file_download.py b/api/views/status/file_download.py deleted file mode 100644 index 56f63a14..00000000 --- a/api/views/status/file_download.py +++ /dev/null @@ -1,80 +0,0 @@ -from rest_framework import status -from rest_framework.response import Response - -from main.models import CrawlerInstance -from main.utils import process_stop_crawl - -def files_found(request, instance_id, num_files): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - except: - return Response(status=status.HTTP_404_NOT_FOUND) - - try: - instance.number_files_found += num_files - instance.save() - - return Response(status=status.HTTP_200_OK) - - except Exception as e: - return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) - - -def success_download_file(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - except: - return Response(status=status.HTTP_404_NOT_FOUND) - - try: - instance.number_files_success_download += 1 - instance.save() - - if instance.page_crawling_finished and instance.download_files_finished(): - process_stop_crawl(instance.crawler.id) - - return Response(status=status.HTTP_200_OK) - - except Exception as e: - return Response({'error': str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR) - -def previously_crawled_file(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - except: - 
return Response(status=status.HTTP_404_NOT_FOUND)
-
-    try:
-        instance.number_files_previously_crawled += 1
-        instance.save()
-
-        if instance.page_crawling_finished and instance.download_files_finished():
-            process_stop_crawl(instance.crawler.id)
-
-        return Response({}, status=status.HTTP_200_OK)
-
-    except:
-        return Response({}, status=status.HTTP_400_BAD_REQUEST)
-
-
-def error_download_file(request, instance_id):
-    try:
-        instance = CrawlerInstance.objects.get(instance_id=instance_id)
-
-    except:
-        return Response(status=status.HTTP_404_NOT_FOUND)
-
-    try:
-        instance.number_files_error_download += 1
-        instance.save()
-
-        if instance.page_crawling_finished and instance.download_files_finished():
-            process_stop_crawl(instance.crawler.id)
-
-        return Response({}, status=status.HTTP_200_OK)
-
-    except:
-        return Response({}, status=status.HTTP_400_BAD_REQUEST)
\ No newline at end of file

From 187911e620c7f8b39b6c1a980e040e54a32022b0 Mon Sep 17 00:00:00 2001
From: Elves Rodrigues
Date: Mon, 5 Jun 2023 17:25:39 -0300
Subject: [PATCH 71/89] Better method names

---
 api/urls.py                   | 6 +++---
 api/views/crawler_instance.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/api/urls.py b/api/urls.py
index 653dff98..15965fd7 100644
--- a/api/urls.py
+++ b/api/urls.py
@@ -32,9 +32,9 @@
     path('instance/', views.CrawlerInstanceViewSet.as_view(only_retrieve_action), name='instance-detail'),
     path('instance//export_config', views.CrawlerInstanceViewSet.as_view({'get': 'export_config'}), name='instance-export-config'),
     path('instance//file/found/', views.CrawlerInstanceViewSet.as_view({'get': 'files_found'}), name='instance-files-found'),
-    path('instance//file/success', views.CrawlerInstanceViewSet.as_view({'get': 'success_download_file'}), name='instance-success-download-file'),
-    path('instance//file/error', views.CrawlerInstanceViewSet.as_view({'get': 'error_download_file'}), name='instance-error-download-file'),
-    path('instance//file/duplicated', views.CrawlerInstanceViewSet.as_view({'get': 'previously_crawled_file'}), name='instance-duplicated-download-file'),
+    path('instance//file/success', views.CrawlerInstanceViewSet.as_view({'get': 'file_success'}), name='instance-success-download-file'),
+    path('instance//file/error', views.CrawlerInstanceViewSet.as_view({'get': 'file_error'}), name='instance-error-download-file'),
+    path('instance//file/duplicated', views.CrawlerInstanceViewSet.as_view({'get': 'file_duplicated'}), name='instance-duplicated-download-file'),
 
     path('task/', views.TaskViewSet.as_view(list_and_create_actions), name='task'),
     path('task/', views.TaskViewSet.as_view(retrieve_update_and_destroy_actions), name='task-detail'),
diff --git a/api/views/crawler_instance.py b/api/views/crawler_instance.py
index 0889d6b1..a425e940 100644
--- a/api/views/crawler_instance.py
+++ b/api/views/crawler_instance.py
@@ -80,13 +80,13 @@ def files_found(self, request, pk, num_files):
         return self._update_file_info(pk, 'found', num_files)
 
     @action(detail=True, methods=['get'])
-    def success_download_file(self, request, pk):
+    def file_success(self, request, pk):
         return self._update_file_info(pk, 'success')
 
     @action(detail=True, methods=['get'])
-    def previously_crawled_file(self, request, pk):
+    def file_duplicated(self, request, pk):
         return self._update_file_info(pk, 'duplicated')
 
     @action(detail=True, methods=['get'])
-    def error_download_file(self, request, pk):
+    def file_error(self, request, pk):
         return self._update_file_info(pk, 'error')
\ No newline at end of file

From 
8caa81d8371e76c34dd73c5811b723ebfc03f9f8 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Mon, 5 Jun 2023 17:38:35 -0300 Subject: [PATCH 72/89] New endpoints for instance API --- api/urls.py | 8 +++- api/views/crawler_instance.py | 83 +++++++++++++++++++++++++++++------ 2 files changed, 77 insertions(+), 14 deletions(-) diff --git a/api/urls.py b/api/urls.py index 15965fd7..9c6f690e 100644 --- a/api/urls.py +++ b/api/urls.py @@ -34,7 +34,13 @@ path('instance//file/found/', views.CrawlerInstanceViewSet.as_view({'get': 'files_found'}), name='instance-files-found'), path('instance//file/success', views.CrawlerInstanceViewSet.as_view({'get': 'file_success'}), name='instance-success-download-file'), path('instance//file/error', views.CrawlerInstanceViewSet.as_view({'get': 'file_error'}), name='instance-error-download-file'), - path('instance//file/duplicated', views.CrawlerInstanceViewSet.as_view({'get': 'file_duplicated'}), name='instance-duplicated-download-file'), + path('instance//file/previously', views.CrawlerInstanceViewSet.as_view({'get': 'file_previously'}), name='instance-previously-download-file'), + path('instance//page/found/', views.CrawlerInstanceViewSet.as_view({'get': 'pages_found'}), name='instance-pages-found'), + path('instance//page/success', views.CrawlerInstanceViewSet.as_view({'get': 'page_success'}), name='instance-success-download-page'), + path('instance//page/error', views.CrawlerInstanceViewSet.as_view({'get': 'page_error'}), name='instance-error-download-page'), + path('instance//page/previously', views.CrawlerInstanceViewSet.as_view({'get': 'page_previously'}), name='instance-previously-download-page'), + path('instance//page/duplicated', views.CrawlerInstanceViewSet.as_view({'get': 'page_duplicated'}), name='instance-duplicated-download-page'), + path('task/', views.TaskViewSet.as_view(list_and_create_actions), name='task'), path('task/', views.TaskViewSet.as_view(retrieve_update_and_destroy_actions), name='task-detail'), diff --git a/api/views/crawler_instance.py b/api/views/crawler_instance.py index a425e940..dc416bd3 100644 --- a/api/views/crawler_instance.py +++ b/api/views/crawler_instance.py @@ -44,8 +44,9 @@ def export_config(self, request, pk): response['Content-Disposition'] = 'attachment; filename=%s' % file_name return response - - def _update_file_info(self, instance_id, operation: Literal['found', 'success', 'error', 'duplicated'], val: int = 1): + + def _update_download_info(self, instance_id, download_type: Literal['page', 'file'], + operation: Literal['found', 'success', 'error', 'duplicated', 'previously'], val: int = 1): try: instance = CrawlerInstance.objects.get(instance_id=instance_id) @@ -53,17 +54,43 @@ def _update_file_info(self, instance_id, operation: Literal['found', 'success', return Response(status=status.HTTP_404_NOT_FOUND) try: - if operation == 'found': - instance.number_files_found += val + if download_type == 'page': + if operation == 'found': + instance.number_pages_found += val + + elif operation == 'success': + instance.number_pages_success_download += val + + elif operation == 'error': + instance.number_pages_error_download+= val + + elif operation == 'duplicated': + instance.number_pages_duplicated_download += val - elif operation == 'success': - instance.number_files_success_download += val + elif operation == 'previously': + instance.number_pages_previously_crawled += val - elif operation == 'error': - instance.number_files_error_download += val + else: + raise Exception(f'Invalid operation: {operation}') + + elif 
download_type == 'file': + if operation == 'found': + instance.number_files_found += val - elif operation == 'duplicated': - instance.number_files_previously_crawled += val + elif operation == 'success': + instance.number_files_success_download += val + + elif operation == 'error': + instance.number_files_error_download += val + + elif operation == 'previously': + instance.number_files_previously_crawled += val + + else: + raise Exception(f'Invalid operation: {operation}') + + else: + raise Exception(f'Invalid download type: {download_type}') instance.save() @@ -75,6 +102,16 @@ def _update_file_info(self, instance_id, operation: Literal['found', 'success', except Exception as e: return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) + def _update_file_info(self, instance_id, + operation: Literal['found', 'success', 'error', 'previously'], + val: int = 1): + return self._update_download_info(instance_id, 'file', operation, val) + + def _update_file_info(self, instance_id, + operation: Literal['found', 'success', 'error', 'previously', 'duplicated'], + val: int = 1): + return self._update_download_info(instance_id, 'page', operation, val) + @action(detail=True, methods=['get']) def files_found(self, request, pk, num_files): return self._update_file_info(pk, 'found', num_files) @@ -84,9 +121,29 @@ def file_success(self, request, pk): return self._update_file_info(pk, 'success') @action(detail=True, methods=['get']) - def file_duplicated(self, request, pk): - return self._update_file_info(pk, 'duplicated') + def file_previously(self, request, pk): + return self._update_file_info(pk, 'previously') @action(detail=True, methods=['get']) def file_error(self, request, pk): - return self._update_file_info(pk, 'error') \ No newline at end of file + return self._update_file_info(pk, 'error') + + @action(detail=True, methods=['get']) + def pages_found(self, request, pk, num_files): + return self._update_page_info(pk, 'found', num_files) + + @action(detail=True, methods=['get']) + def page_success(self, request, pk): + return self._update_page_info(pk, 'success') + + @action(detail=True, methods=['get']) + def page_previously(self, request, pk): + return self._update_page_info(pk, 'previously') + + @action(detail=True, methods=['get']) + def page_duplicated(self, request, pk): + return self._update_page_info(pk, 'duplicated') + + @action(detail=True, methods=['get']) + def page_error(self, request, pk): + return self._update_page_info(pk, 'error') \ No newline at end of file From 1bf6abd48a8f1b808a0bf5f2f87e7dd778ce83c3 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 6 Jun 2023 11:23:11 -0300 Subject: [PATCH 73/89] Moving instance related functions to its own class --- api/urls.py | 27 ++-- api/views/crawler.py | 1 + api/views/crawler_instance.py | 199 ++++++++++++++++++++++++++++- api/views/crawler_queue.py | 11 +- api/views/debugging/__init__.py | 0 api/views/debugging/log.py | 99 -------------- api/views/debugging/screenshots.py | 55 -------- api/views/debugging/trace.py | 34 ----- api/views/status/__init__.py | 0 api/views/status/page_crawling.py | 86 ------------- crawler_manager/settings.py | 2 +- link_generator/requirements.txt | 8 +- link_generator/src/settings.py | 6 +- spider_manager/requirements.txt | 12 +- spider_manager/src/settings.py | 8 +- writer/requirements.txt | 2 +- writer/src/settings.py | 4 +- 17 files changed, 242 insertions(+), 312 deletions(-) delete mode 100644 api/views/debugging/__init__.py delete mode 100644 api/views/debugging/log.py delete mode 
100644 api/views/debugging/screenshots.py delete mode 100644 api/views/debugging/trace.py delete mode 100644 api/views/status/__init__.py delete mode 100644 api/views/status/page_crawling.py diff --git a/api/urls.py b/api/urls.py index 9c6f690e..48b77dc5 100644 --- a/api/urls.py +++ b/api/urls.py @@ -6,13 +6,6 @@ app_name = 'api' -# Router for API endpoints -# api_router = routers.DefaultRouter() -# api_router.register(r'crawlers', views.CrawlerViewSet) -# api_router.register(r'instances', views.CrawlerInstanceViewSet) -# api_router.register(r'crawler_queue', views.CrawlerQueueViewSet) -# api_router.register(r'tasks', views.TaskViewSet) - list_and_create_actions = {'get': 'list', 'post': 'create'} retrieve_update_and_destroy_actions = {'get': 'retrieve', 'put': 'update', 'delete': 'destroy'} all_actions = {'get': 'list', 'post': 'create', 'put': 'update', 'delete': 'destroy'} @@ -22,30 +15,48 @@ urlpatterns = [ path('', lambda request: redirect('api:swagger-ui', permanent=True)), + # crawler info path('crawler/', views.CrawlerViewSet.as_view(list_and_create_actions), name='crawler'), path('crawler/', views.CrawlerViewSet.as_view(retrieve_update_and_destroy_actions), name='crawler-detail'), path('crawler//run', views.CrawlerViewSet.as_view({'get': 'run'}), name='crawler-run'), path('crawler//stop', views.CrawlerViewSet.as_view({'get': 'stop'}), name='crawler-run'), path('crawler//group', views.CrawlerViewSet.as_view({'get': 'group'}), name='crawler-group'), + # instance path('instance/', views.CrawlerInstanceViewSet.as_view(only_list_action), name='instance'), path('instance/', views.CrawlerInstanceViewSet.as_view(only_retrieve_action), name='instance-detail'), - path('instance//export_config', views.CrawlerInstanceViewSet.as_view({'get': 'export_config'}), name='instance-export-config'), + + # instance config export + path('instance//export/config', views.CrawlerInstanceViewSet.as_view({'get': 'export_config'}), name='instance-export-config'), + + # instance update file download status path('instance//file/found/', views.CrawlerInstanceViewSet.as_view({'get': 'files_found'}), name='instance-files-found'), path('instance//file/success', views.CrawlerInstanceViewSet.as_view({'get': 'file_success'}), name='instance-success-download-file'), path('instance//file/error', views.CrawlerInstanceViewSet.as_view({'get': 'file_error'}), name='instance-error-download-file'), path('instance//file/previously', views.CrawlerInstanceViewSet.as_view({'get': 'file_previously'}), name='instance-previously-download-file'), + + # instance update page download status path('instance//page/found/', views.CrawlerInstanceViewSet.as_view({'get': 'pages_found'}), name='instance-pages-found'), path('instance//page/success', views.CrawlerInstanceViewSet.as_view({'get': 'page_success'}), name='instance-success-download-page'), path('instance//page/error', views.CrawlerInstanceViewSet.as_view({'get': 'page_error'}), name='instance-error-download-page'), path('instance//page/previously', views.CrawlerInstanceViewSet.as_view({'get': 'page_previously'}), name='instance-previously-download-page'), path('instance//page/duplicated', views.CrawlerInstanceViewSet.as_view({'get': 'page_duplicated'}), name='instance-duplicated-download-page'), + # instance get logs + path('instance//log/tail', views.CrawlerInstanceViewSet.as_view({'get': 'tail_log'}), name='instance-log-tail'), + path('instance//log/raw/error', views.CrawlerInstanceViewSet.as_view({'get': 'raw_log_err'}), name='instance-log-raw-error'), + 
path('instance//log/raw/out', views.CrawlerInstanceViewSet.as_view({'get': 'raw_log_out'}), name='instance-log-raw-out'), + # instance debug + path('instance//debug/trace', views.CrawlerInstanceViewSet.as_view({'get': 'export_trace'}), name='instance-debug-trace'), + path('instance//debug/screenshots', views.CrawlerInstanceViewSet.as_view({'get': 'screenshots'}), name='instance-debug-screenshots'), + + # task info path('task/', views.TaskViewSet.as_view(list_and_create_actions), name='task'), path('task/', views.TaskViewSet.as_view(retrieve_update_and_destroy_actions), name='task-detail'), path('task//filter', views.TaskViewSet.as_view({'get': 'filter'}), name='task-filter'), + # queue info path('queue/', views.CrawlerQueueViewSet.as_view({'get': 'retrieve', 'put': 'update'}), name='queue'), path('queue/switch_position//', views.CrawlerQueueViewSet.as_view({'get': 'switch_position'}), name='queue-switch-position'), path('queue/force_execution/', views.CrawlerQueueViewSet.as_view({'get': 'force_execution'}), name='queue-force-execution'), diff --git a/api/views/crawler.py b/api/views/crawler.py index b3b22e00..4e7c8b38 100644 --- a/api/views/crawler.py +++ b/api/views/crawler.py @@ -97,6 +97,7 @@ def run(self, request, pk): def stop(self, request, pk): try: process_stop_crawl(pk) + except Exception as e: return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) diff --git a/api/views/crawler_instance.py b/api/views/crawler_instance.py index dc416bd3..59308c14 100644 --- a/api/views/crawler_instance.py +++ b/api/views/crawler_instance.py @@ -1,4 +1,9 @@ import os +import subprocess +import time +import os +import base64 +from datetime import datetime from rest_framework import viewsets from rest_framework.decorators import action @@ -7,7 +12,7 @@ from django.http import FileResponse from typing_extensions import Literal -from main.models import CrawlerInstance +from main.models import CrawlerInstance, CrawlRequest from main.utils import process_stop_crawl from main.serializers import CrawlerInstanceSerializer @@ -46,7 +51,8 @@ def export_config(self, request, pk): return response def _update_download_info(self, instance_id, download_type: Literal['page', 'file'], - operation: Literal['found', 'success', 'error', 'duplicated', 'previously'], val: int = 1): + operation: Literal['found', 'success', 'error', 'duplicated', 'previously'], + val: int = 1): try: instance = CrawlerInstance.objects.get(instance_id=instance_id) @@ -54,6 +60,8 @@ def _update_download_info(self, instance_id, download_type: Literal['page', 'fil return Response(status=status.HTTP_404_NOT_FOUND) try: + print(f'Updating {download_type} info for instance {instance_id}') + if download_type == 'page': if operation == 'found': instance.number_pages_found += val @@ -107,7 +115,7 @@ def _update_file_info(self, instance_id, val: int = 1): return self._update_download_info(instance_id, 'file', operation, val) - def _update_file_info(self, instance_id, + def _update_page_info(self, instance_id, operation: Literal['found', 'success', 'error', 'previously', 'duplicated'], val: int = 1): return self._update_download_info(instance_id, 'page', operation, val) @@ -146,4 +154,187 @@ def page_duplicated(self, request, pk): @action(detail=True, methods=['get']) def page_error(self, request, pk): - return self._update_page_info(pk, 'error') \ No newline at end of file + return self._update_page_info(pk, 'error') + + def raw_log_err(self, request, instance_id): + try: + instance = 
CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response({'error': f'Crawler instance {instance_id} not found!'}, status=status.HTTP_404_NOT_FOUND) + + n_lines = int(request.GET.get('n_lines', 100)) + + config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0] + data_path = os.path.join(OUTPUT_FOLDER, config["data_path"]) + + err = subprocess.run(['tail', + f'{data_path}/{instance_id}/log/{instance_id}.err', + '-n', + f'{n_lines}'], + stdout=subprocess.PIPE).stdout + + raw_text = err.decode('utf-8') + raw_results = raw_text.splitlines(True) + + resp = Response({str(instance_id): raw_results}, json_dumps_params={'indent': 4}, status=status.HTTP_200_OK) + + if len(raw_results) > 0 and instance.running: + resp['Refresh'] = 5 + + return resp + + def raw_log_out(self, request, instance_id): + try: + instance = CrawlerInstance.objects.get(instance_id=instance_id) + + except: + return Response({'error': f'Crawler instance {instance_id} not found!'}, status=status.HTTP_404_NOT_FOUND) + + n_lines = int(request.GET.get('n_lines', 100)) + + config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0] + data_path = os.path.join(OUTPUT_FOLDER, config["data_path"]) + + out = subprocess.run(['tail', + f'{data_path}/{instance_id}/log/{instance_id}.out', + '-n', + f'{n_lines}'], + stdout=subprocess.PIPE).stdout + + raw_text = out.decode('utf-8') + raw_results = raw_text.splitlines(True) + resp = Response({str(instance_id): raw_results}, json_dumps_params={'indent': 4}) + + if len(raw_results) > 0 and instance.running: + resp['Refresh'] = 5 + + return resp + + @action(detail=True, methods=['get']) + def tail_log(self, request, pk): + try: + instance = CrawlerInstance.objects.get(instance_id=pk) + + except: + return Response({'error': f'Crawler instance {pk} not found!'}, status=status.HTTP_404_NOT_FOUND) + + n_lines = int(request.query_params.get('n_lines', '20')) + + files_found = instance.number_files_found + download_file_success = instance.number_files_success_download + download_file_error = instance.number_files_error_download + number_files_previously_crawled = instance.number_files_previously_crawled + + pages_found = instance.number_pages_found + download_page_success = instance.number_pages_success_download + download_page_error = instance.number_pages_error_download + number_pages_duplicated_download = instance.number_pages_duplicated_download + number_pages_previously_crawled = instance.number_pages_previously_crawled + + config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0] + data_path = os.path.join(OUTPUT_FOLDER, config["data_path"]) + + out = subprocess.run(['tail', + f'{data_path}/{pk}/log/{pk}.out', + '-n', + f'{n_lines}'], + stdout=subprocess.PIPE).stdout + + err = subprocess.run(['tail', + f'{data_path}/{pk}/log/{pk}.err', + '-n', + f'{n_lines}'], + stdout=subprocess.PIPE).stdout + + return Response({ + 'files_found': files_found, + 'files_success': download_file_success, + 'files_error': download_file_error, + 'files_previously_crawled': number_files_previously_crawled, + 'pages_found': pages_found, + 'pages_success': download_page_success, + 'pages_error': download_page_error, + 'pages_duplicated': number_pages_duplicated_download, + 'pages_previously_crawled': number_pages_previously_crawled, + 'out': out.decode('utf-8'), + 'err': err.decode('utf-8'), + 'time': str(datetime.fromtimestamp(time.time())), + }, status=status.HTTP_200_OK) + + @action(detail=True, methods=['get']) + def 
screenshots(request, pk): + try: + instance = CrawlerInstance.objects.get(pk=pk) # get_object_or_404(CrawlerInstance, pk=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + imgs_per_page = int(request.GET.get('imgs_per_page', 20)) + page = int(request.GET.get('page', 1)) + + output_folder = os.getenv('OUTPUT_FOLDER', '/data') + data_path = instance.crawler.data_path + instance_path = os.path.join(output_folder, data_path, str(pk)) + + screenshot_dir = os.path.join(instance_path, 'data', 'screenshots') + + if not os.path.isdir(screenshot_dir): + return Response({ + 'error': 'Path of screenshots not found.', + 'total_screenshots': 0 + }, status=status.HTTP_404_NOT_FOUND) + + screenshot_list = sorted(os.listdir(screenshot_dir)) + total_screenshots = len(screenshot_list) + + if total_screenshots == 0: + return Response({ + 'error': 'None screenshots found.', + 'total_screenshots': 0 + }, status=status.HTTP_404_NOT_FOUND) + + screenshot_list = screenshot_list[(page - 1) * imgs_per_page: + page * imgs_per_page] + + image_data = [] + for index, screenshot in enumerate(screenshot_list): + img_path = os.path.join(screenshot_dir, screenshot) + with open(img_path, "rb") as image: + curr_img = { + 'base64': base64.b64encode(image.read()).decode('ascii'), + 'title': str(1 + index + ((page - 1) * imgs_per_page)) + } + image_data.append(curr_img) + + return Response({ + 'data': image_data, + 'total_screenshots': total_screenshots + }, status=status.HTTP_200_OK) + + @action(detail=True, methods=['get']) + def export_trace(self, request, pk): + try: + instance = CrawlerInstance.objects.get(pk=pk) # get_object_or_404(CrawlerInstance, pk=instance_id) + + except: + return Response(status=status.HTTP_404_NOT_FOUND) + + data_path = instance.crawler.data_path + + file_name = f'{pk}.zip' + rel_path = os.path.join(data_path, str(pk), 'debug', 'trace', file_name) + path = os.path.join(OUTPUT_FOLDER, rel_path) + + try: + response = FileResponse(open(path, 'rb'), content_type='zip') + + except FileNotFoundError: + return Response({'error': 'Verifique se a opção de gerar arquivo trace foi habilitada na configuração do coletor'}, + status=status.HTTP_404_NOT_FOUND) + + else: + response['Content-Length'] = os.path.getsize(path) + response['Content-Disposition'] = 'attachment; filename=%s' % file_name + + return response \ No newline at end of file diff --git a/api/views/crawler_queue.py b/api/views/crawler_queue.py index 42d75b1a..ef53d453 100644 --- a/api/views/crawler_queue.py +++ b/api/views/crawler_queue.py @@ -1,11 +1,10 @@ from django.core.exceptions import ObjectDoesNotExist from django.db import transaction -from django.http import JsonResponse from rest_framework import status, viewsets from rest_framework.decorators import action from rest_framework.response import Response -from main.models import CrawlerQueue, CrawlerQueueItem, CRAWLER_QUEUE_DB_ID +from main.models import CrawlerQueue, CrawlerQueueItem from main.serializers import CrawlerQueueSerializer from main.utils import (process_run_crawl, unqueue_crawl_requests, CRAWLER_QUEUE) @@ -13,7 +12,6 @@ class CrawlerQueueViewSet(viewsets.ModelViewSet): queryset = CrawlerQueue.objects.all() serializer_class = CrawlerQueueSerializer http_method_names = ['get', 'put'] - def retrieve(self, request): crawler_queue = CrawlerQueue.to_dict() return Response(crawler_queue) @@ -44,7 +42,7 @@ def switch_position(self, request, a: int, b: int): queue_item_a.save() queue_item_b.save() - return Response({'message': 'success'}, 
status=status.HTTP_200_OK) + return Response(status=status.HTTP_200_OK) @action(detail=False, methods=['get']) def force_execution(self, request, item_id: int): @@ -81,8 +79,11 @@ def remove_item(self, request, item_id: int): return Response(status=status.HTTP_204_NO_CONTENT) + def get_object(self): + return CrawlerQueue.object() + def update(self, request): - response = super().update(request, pk=CRAWLER_QUEUE_DB_ID) + response = super().update(request) # updade crawler queue instance with new configs global CRAWLER_QUEUE diff --git a/api/views/debugging/__init__.py b/api/views/debugging/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/api/views/debugging/log.py b/api/views/debugging/log.py deleted file mode 100644 index b409fd95..00000000 --- a/api/views/debugging/log.py +++ /dev/null @@ -1,99 +0,0 @@ -import os -import subprocess -import time -from datetime import datetime - -from rest_framework.response import Response - -from crawler_manager.settings import OUTPUT_FOLDER -from main.models import CrawlRequest, CrawlerInstance - -def raw_log_err(request, instance_id): - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0] - data_path = os.path.join(OUTPUT_FOLDER, config["data_path"]) - - err = subprocess.run(["tail", - f"{data_path}/{instance_id}/log/{instance_id}.err", - "-n", - "100"], - stdout=subprocess.PIPE).stdout - - raw_text = err.decode('utf-8') - raw_results = raw_text.splitlines(True) - resp = Response({str(instance_id): raw_results}, - json_dumps_params={'indent': 2}) - - if len(raw_results) > 0 and instance.running: - resp['Refresh'] = 5 - - return resp - -def raw_log_out(request, instance_id): - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0] - data_path = os.path.join(OUTPUT_FOLDER, config["data_path"]) - - out = subprocess.run(["tail", - f"{data_path}/{instance_id}/log/{instance_id}.out", - "-n", - "100"], - stdout=subprocess.PIPE).stdout - - raw_text = out.decode('utf-8') - raw_results = raw_text.splitlines(True) - resp = Response({str(instance_id): raw_results}, - json_dumps_params={'indent': 2}) - - if len(raw_results) > 0 and instance.running: - resp['Refresh'] = 5 - - return resp - -def tail_log_file(request, instance_id): - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - files_found = instance.number_files_found - download_file_success = instance.number_files_success_download - download_file_error = instance.number_files_error_download - number_files_previously_crawled = instance.number_files_previously_crawled - - pages_found = instance.number_pages_found - download_page_success = instance.number_pages_success_download - download_page_error = instance.number_pages_error_download - number_pages_duplicated_download = instance.number_pages_duplicated_download - number_pages_previously_crawled = instance.number_pages_previously_crawled - - config = CrawlRequest.objects.filter(id=int(instance.crawler.id)).values()[0] - data_path = os.path.join(OUTPUT_FOLDER, config["data_path"]) - - out = subprocess.run(["tail", - f"{data_path}/{instance_id}/log/{instance_id}.out", - "-n", - "20"], - stdout=subprocess.PIPE).stdout - - err = subprocess.run(["tail", - f"{data_path}/{instance_id}/log/{instance_id}.err", - "-n", - "20"], - stdout=subprocess.PIPE).stdout - - return Response({ - "files_found": files_found, - "files_success": 
download_file_success, - "files_error": download_file_error, - "files_previously_crawled": number_files_previously_crawled, - - "pages_found": pages_found, - "pages_success": download_page_success, - "pages_error": download_page_error, - "pages_duplicated": number_pages_duplicated_download, - "pages_previously_crawled": number_pages_previously_crawled, - - "out": out.decode('utf-8'), - "err": err.decode('utf-8'), - "time": str(datetime.fromtimestamp(time.time())), - }) \ No newline at end of file diff --git a/api/views/debugging/screenshots.py b/api/views/debugging/screenshots.py deleted file mode 100644 index ddb1d539..00000000 --- a/api/views/debugging/screenshots.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -import base64 - -from rest_framework.response import Response -from rest_framework import status - -from main.models import CrawlerInstance - -def view_screenshots(request, instance_id, page): - IMGS_PER_PAGE = 20 - - try: - instance = CrawlerInstance.objects.get(pk=instance_id) # get_object_or_404(CrawlerInstance, pk=instance_id) - - except: - return Response(status=status.HTTP_404_NOT_FOUND) - - output_folder = os.getenv('OUTPUT_FOLDER', '/data') - data_path = instance.crawler.data_path - instance_path = os.path.join(output_folder, data_path, str(instance_id)) - - screenshot_dir = os.path.join(instance_path, "data", "screenshots") - - if not os.path.isdir(screenshot_dir): - return Response({ - 'error': 'Pasta de coleta não encontrada.', - 'total_screenshots': 0 - }, status=status.HTTP_200_OK) - - screenshot_list = sorted(os.listdir(screenshot_dir)) - total_screenshots = len(screenshot_list) - - if total_screenshots == 0: - return Response({ - 'error': 'Nenhum screenshot encontrado.', - 'total_screenshots': 0 - }, status=status.HTTP_200_OK) - - screenshot_list = screenshot_list[(page - 1) * IMGS_PER_PAGE: - page * IMGS_PER_PAGE] - - image_data = [] - for index, screenshot in enumerate(screenshot_list): - img_path = os.path.join(screenshot_dir, screenshot) - with open(img_path, "rb") as image: - curr_img = { - 'base64': base64.b64encode(image.read()).decode('ascii'), - 'title': str(1 + index + ((page - 1) * IMGS_PER_PAGE)) - } - image_data.append(curr_img) - - return Response({ - 'data': image_data, - 'total_screenshots': total_screenshots - }, status=status.HTTP_200_OK) \ No newline at end of file diff --git a/api/views/debugging/trace.py b/api/views/debugging/trace.py deleted file mode 100644 index 1fdb0fc4..00000000 --- a/api/views/debugging/trace.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from rest_framework.response import Response -from rest_framework import status - -from django.http import FileResponse -from main.models import CrawlerInstance - -from crawler_manager.settings import OUTPUT_FOLDER - -def export_trace(request, instance_id): - try: - instance = CrawlerInstance.objects.get(pk=instance_id) # get_object_or_404(CrawlerInstance, pk=instance_id) - - except: - return Response(status=status.HTTP_404_NOT_FOUND) - - data_path = instance.crawler.data_path - - file_name = f"{instance_id}.zip" - rel_path = os.path.join(data_path, str(instance_id), "debug", "trace", file_name) - path = os.path.join(OUTPUT_FOLDER, rel_path) - - try: - response = FileResponse(open(path, 'rb'), content_type='zip') - - except FileNotFoundError: - return Response({'error': 'Verifique se a opção de gerar arquivo trace foi habilitada na configuração do coletor'}, - status=status.HTTP_404_NOT_FOUND) - - else: - response['Content-Length'] = os.path.getsize(path) - response['Content-Disposition'] = 
"attachment; filename=%s" % file_name - - return response \ No newline at end of file diff --git a/api/views/status/__init__.py b/api/views/status/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/api/views/status/page_crawling.py b/api/views/status/page_crawling.py deleted file mode 100644 index b521710a..00000000 --- a/api/views/status/page_crawling.py +++ /dev/null @@ -1,86 +0,0 @@ -from rest_framework.response import Response -from rest_framework import status - -from main.models import CrawlerInstance - -def pages_found(request, instance_id, num_pages): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - except: - return Response(status=status.HTTP_404_NOT_FOUND) - - try: - instance.number_pages_found += num_pages - instance.save() - - return Response(status=status.HTTP_200_OK) - - except Exception as e: - return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) - - -def success_download_page(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - except: - return Response(status=status.HTTP_404_NOT_FOUND) - - try: - instance.number_pages_success_download += 1 - instance.save() - - return Response(status=status.HTTP_200_OK) - - except Exception as e: - return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) - -def previously_crawled_page(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - except: - return Response(status=status.HTTP_404_NOT_FOUND) - - try: - instance.number_pages_previously_crawled += 1 - instance.save() - - return Response(status=status.HTTP_200_OK) - - except Exception as e: - return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) - -def error_download_page(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - except: - return Response(status=status.HTTP_404_NOT_FOUND) - - try: - instance.number_pages_error_download += 1 - instance.save() - - return Response(status=status.HTTP_200_OK) - - except Exception as e: - return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) - - -def duplicated_download_page(request, instance_id): - try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) - - except: - return Response(status=status.HTTP_404_NOT_FOUND) - - try: - instance.number_pages_duplicated_download += 1 - instance.save() - - return Response(status=status.HTTP_200_OK) - - except Exception as e: - return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) diff --git a/crawler_manager/settings.py b/crawler_manager/settings.py index d25071ee..de59bc00 100644 --- a/crawler_manager/settings.py +++ b/crawler_manager/settings.py @@ -44,4 +44,4 @@ TASK_TOPIC = os.getenv('TASK_TOPIC', KAFKA_TOPIC_PREFIX + 'task_topic') TASK_DATA_CONSUMER_GROUP = os.getenv('TASK_DATA_CONSUMER_DATA', KAFKA_TOPIC_PREFIX + '.task_data_group') -OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/data') \ No newline at end of file +OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/home/elves/Desktop/data') \ No newline at end of file diff --git a/link_generator/requirements.txt b/link_generator/requirements.txt index 569dc00e..b4580e96 100644 --- a/link_generator/requirements.txt +++ b/link_generator/requirements.txt @@ -1,7 +1,7 @@ -../entry_probing/ -../param_injector -../range_inference/ -../crawling_utils/ +../src/entry_probing/ +../src/param_injector +../src/range_inference/ +../src/crawling_utils/ kafka-python==2.0.2 redis==3.5.3 
tldextract==3.1.0
diff --git a/link_generator/src/settings.py b/link_generator/src/settings.py
index d8486de2..596fb9ab 100644
--- a/link_generator/src/settings.py
+++ b/link_generator/src/settings.py
@@ -2,7 +2,7 @@
 
 KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg')
 
-KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')]
+KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')]
 KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest'
 KAFKA_CONSUMER_TIMEOUT = 120000
 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000
@@ -15,7 +15,7 @@
 KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000
 
 # Redis host information
-REDIS_HOST = os.getenv('REDIS_HOST', 'redis')
+REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
 REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
 REDIS_DB = int(os.getenv('REDIS_DB', 0))
 REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None)
@@ -23,7 +23,7 @@
 
 # django application port
 SERVER_NEW_PAGE_FOUND_URL = os.getenv('SERVER_NEW_PAGE_FOUND_URL',
-                                      'http://web_server:8000/download/pages/found/{instance_id}/{num_pages}')
+                                      'http://localhost:8000/api/instance/{instance_id}/page/found/{num_pages}')
 
 # Kafka topics
 LINK_GENERATOR_TOPIC = os.getenv('LINK_GENERATOR_TOPIC', KAFKA_TOPIC_PREFIX + '.link_generator')
diff --git a/spider_manager/requirements.txt b/spider_manager/requirements.txt
index d04463cd..1be5ea23 100644
--- a/spider_manager/requirements.txt
+++ b/spider_manager/requirements.txt
@@ -1,9 +1,9 @@
-../crawling_utils/
-../step-by-step/
-../camuflage_scrapy/
-../captcha_solver/
-../cssify/
-../scutils/
+../src/crawling_utils/
+../src/step-by-step/
+../src/camuflage_scrapy/
+../src/captcha_solver/
+../src/cssify/
+./scutils/
 attrs>=19.2.0 # 18.1.0 # Updated from 17.2.0'
 cchardet==2.1.7
 cffi==1.12.3 # Updated from 1.10.0
diff --git a/spider_manager/src/settings.py b/spider_manager/src/settings.py
index 2127fa21..e2715e68 100644
--- a/spider_manager/src/settings.py
+++ b/spider_manager/src/settings.py
@@ -3,12 +3,12 @@
 # Zookeeper host information
 ZOOKEEPER_ASSIGN_PATH = os.getenv('ZOOKEEPER_ASSIGN_PATH', '/scrapy-cluster/crawler/')
 ZOOKEEPER_ID = os.getenv('ZOOKEEPER_ID', 'all')
-ZOOKEEPER_HOSTS = [x.strip() for x in os.getenv('ZOOKEEPER_HOSTS', 'zookeeper:2181').split(',')]
+ZOOKEEPER_HOSTS = [x.strip() for x in os.getenv('ZOOKEEPER_HOSTS', 'localhost:2181').split(',')]
 
 # Kafka host information
 KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg')
 
-KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')]
+KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')]
 KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest'
 KAFKA_CONSUMER_TIMEOUT = 120000
 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000
@@ -21,7 +21,7 @@
 KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000
 
 # Redis host information
-REDIS_HOST = os.getenv('REDIS_HOST', 'redis')
+REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
 REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
 REDIS_DB = int(os.getenv('REDIS_DB', 0))
 REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None)
@@ -33,4 +33,4 @@
 COMMANDS_TOPIC = os.getenv('COMMANDS_TOPIC', KAFKA_TOPIC_PREFIX + '.commands')
 NOTIFICATIONS_TOPIC = os.getenv('NOTIFICATIONS_TOPIC', KAFKA_TOPIC_PREFIX + '.notifications')
 
-OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/data')
+OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/home/elves/Desktop/data')
diff --git a/writer/requirements.txt b/writer/requirements.txt
index 542d46fe..7772d232 100644
--- a/writer/requirements.txt
+++ b/writer/requirements.txt
@@ 
-1,4 +1,4 @@
-../crawling_utils/
+../src/crawling_utils/
 beautifulsoup4==4.11.1
 certifi==2021.5.30
 chardet==4.0.0
diff --git a/writer/src/settings.py b/writer/src/settings.py
index 8c6c1488..aa7b454a 100644
--- a/writer/src/settings.py
+++ b/writer/src/settings.py
@@ -1,6 +1,6 @@
 import os
 
-KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')]
+KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')]
 
 KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg')
 KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest'
@@ -32,4 +32,4 @@
 FILE_DOWNLOADER_CONSUMER_GROUP = os.getenv('FILE_DOWNLOADER_CONSUMER_GROUP', KAFKA_TOPIC_PREFIX + '.file_downloader_group')
 FILE_DESCRIPTOR_CONSUMER_GROUP = os.getenv('FILE_DESCRIPTOR_CONSUMER_GROUP', KAFKA_TOPIC_PREFIX + '.file_descriptor_group')
 
-OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/data')
\ No newline at end of file
+OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/home/elves/Desktop/data')
\ No newline at end of file

From f8f2c52b476036dfb26817122910d12286ec6966 Mon Sep 17 00:00:00 2001
From: Elves Rodrigues
Date: Tue, 6 Jun 2023 14:35:56 -0300
Subject: [PATCH 74/89] Fix errors when downloading the instance config

---
 api/views/crawler_instance.py     | 22 +++++++++++-----------
 crawler_manager/settings.py       |  8 ++++----
 docker/django-gunicorn/Dockerfile |  1 +
 link_generator/requirements.txt   |  8 ++++----
 link_generator/src/settings.py    |  6 +++---
 main/utils.py                     |  4 +++-
 main/views.py                     |  1 -
 spider_manager/requirements.txt   | 12 ++++++------
 spider_manager/src/settings.py    |  8 ++++----
 writer/requirements.txt           |  2 +-
 writer/src/settings.py            |  4 ++--
 11 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/api/views/crawler_instance.py b/api/views/crawler_instance.py
index 59308c14..def075b3 100644
--- a/api/views/crawler_instance.py
+++ b/api/views/crawler_instance.py
@@ -9,7 +9,7 @@
 from rest_framework.decorators import action
 from rest_framework.response import Response
 from rest_framework import status
-from django.http import FileResponse
+from django.http import FileResponse, JsonResponse
 from typing_extensions import Literal
 
 from main.models import CrawlerInstance, CrawlRequest
@@ -156,12 +156,12 @@ def page_duplicated(self, request, pk):
     def page_error(self, request, pk):
         return self._update_page_info(pk, 'error')
 
-    def raw_log_err(self, request, instance_id):
+    def raw_log_err(self, request, pk):
         try:
-            instance = CrawlerInstance.objects.get(instance_id=instance_id)
+            instance = CrawlerInstance.objects.get(instance_id=pk)
 
         except:
-            return Response({'error': f'Crawler instance {instance_id} not found!'}, status=status.HTTP_404_NOT_FOUND)
+            return Response({'error': f'Crawler instance {pk} not found!'}, status=status.HTTP_404_NOT_FOUND)
 
         n_lines = int(request.GET.get('n_lines', 100))
 
@@ -169,7 +169,7 @@ def raw_log_err(self, request, instance_id):
         data_path = os.path.join(OUTPUT_FOLDER, config["data_path"])
 
         err = subprocess.run(['tail',
-                              f'{data_path}/{instance_id}/log/{instance_id}.err',
+                              f'{data_path}/{pk}/log/{pk}.err',
                              '-n',
                              f'{n_lines}'],
                              stdout=subprocess.PIPE).stdout
@@ -177,19 +177,19 @@ def raw_log_err(self, request, instance_id):
         raw_text = err.decode('utf-8')
         raw_results = raw_text.splitlines(True)
 
-        resp = Response({str(instance_id): raw_results}, json_dumps_params={'indent': 4}, status=status.HTTP_200_OK)
+        resp = JsonResponse({str(pk): raw_results}, json_dumps_params={'indent': 4}, status=status.HTTP_200_OK)
 
         if len(raw_results) > 0 and instance.running:
             resp['Refresh'] = 5
 
         return 
resp - def raw_log_out(self, request, instance_id): + def raw_log_out(self, request, pk): try: - instance = CrawlerInstance.objects.get(instance_id=instance_id) + instance = CrawlerInstance.objects.get(instance_id=pk) except: - return Response({'error': f'Crawler instance {instance_id} not found!'}, status=status.HTTP_404_NOT_FOUND) + return Response({'error': f'Crawler instance {pk} not found!'}, status=status.HTTP_404_NOT_FOUND) n_lines = int(request.GET.get('n_lines', 100)) @@ -197,14 +197,14 @@ def raw_log_out(self, request, instance_id): data_path = os.path.join(OUTPUT_FOLDER, config["data_path"]) out = subprocess.run(['tail', - f'{data_path}/{instance_id}/log/{instance_id}.out', + f'{data_path}/{pk}/log/{pk}.out', '-n', f'{n_lines}'], stdout=subprocess.PIPE).stdout raw_text = out.decode('utf-8') raw_results = raw_text.splitlines(True) - resp = Response({str(instance_id): raw_results}, json_dumps_params={'indent': 4}) + resp = JsonResponse({str(pk): raw_results}, json_dumps_params={'indent': 4}) if len(raw_results) > 0 and instance.running: resp['Refresh'] = 5 diff --git a/crawler_manager/settings.py b/crawler_manager/settings.py index de59bc00..6a07b7a3 100644 --- a/crawler_manager/settings.py +++ b/crawler_manager/settings.py @@ -3,7 +3,7 @@ # Kafka host information KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' KAFKA_CONSUMER_TIMEOUT = 120000 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000 @@ -16,7 +16,7 @@ KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000 # Redis host information -REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') +REDIS_HOST = os.getenv('REDIS_HOST', 'redis') REDIS_PORT = int(os.getenv('REDIS_PORT', 6379)) REDIS_DB = int(os.getenv('REDIS_DB', 0)) REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None) @@ -39,9 +39,9 @@ WRITER_TOPIC = os.getenv('WRITER_TOPIC', KAFKA_TOPIC_PREFIX + '.writer') STOPPED_SPIDER_NOTIFICATION_ADDRESS = os.getenv( - 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://localhost:8000/detail/stop_crawl/{crawler_id}') + 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://web:8000/detail/stop_crawl/{crawler_id}') TASK_TOPIC = os.getenv('TASK_TOPIC', KAFKA_TOPIC_PREFIX + 'task_topic') TASK_DATA_CONSUMER_GROUP = os.getenv('TASK_DATA_CONSUMER_DATA', KAFKA_TOPIC_PREFIX + '.task_data_group') -OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/home/elves/Desktop/data') \ No newline at end of file +OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/data') \ No newline at end of file diff --git a/docker/django-gunicorn/Dockerfile b/docker/django-gunicorn/Dockerfile index 3848217b..1829348f 100644 --- a/docker/django-gunicorn/Dockerfile +++ b/docker/django-gunicorn/Dockerfile @@ -31,6 +31,7 @@ RUN python3 web_install.py RUN pip install pyee==9 COPY main main +COPY api api COPY interface interface COPY crawler_manager crawler_manager diff --git a/link_generator/requirements.txt b/link_generator/requirements.txt index b4580e96..569dc00e 100644 --- a/link_generator/requirements.txt +++ b/link_generator/requirements.txt @@ -1,7 +1,7 @@ -../src/entry_probing/ -../src/param_injector -../src/range_inference/ -../src/crawling_utils/ +../entry_probing/ +../param_injector +../range_inference/ +../crawling_utils/ kafka-python==2.0.2 redis==3.5.3 tldextract==3.1.0 diff --git a/link_generator/src/settings.py b/link_generator/src/settings.py index 596fb9ab..898e4549 
100644 --- a/link_generator/src/settings.py +++ b/link_generator/src/settings.py @@ -2,7 +2,7 @@ KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' KAFKA_CONSUMER_TIMEOUT = 120000 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000 @@ -15,7 +15,7 @@ KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000 # Redis host information -REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') +REDIS_HOST = os.getenv('REDIS_HOST', 'redis') REDIS_PORT = int(os.getenv('REDIS_PORT', 6379)) REDIS_DB = int(os.getenv('REDIS_DB', 0)) REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None) @@ -23,7 +23,7 @@ # django application port SERVER_NEW_PAGE_FOUND_URL = os.getenv('SERVER_NEW_PAGE_FOUND_URL', - 'http://localhost:8000/api/instance/{instance_id}/page/found/{num_pages}') + 'http://web:8000/api/instance/{instance_id}/page/found/{num_pages}') # Kafka topics LINK_GENERATOR_TOPIC = os.getenv('LINK_GENERATOR_TOPIC', KAFKA_TOPIC_PREFIX + '.link_generator') diff --git a/main/utils.py b/main/utils.py index 3bdd9f5a..8504e524 100644 --- a/main/utils.py +++ b/main/utils.py @@ -1,4 +1,4 @@ -from typing import Literal +from typing_extensions import Literal from django.db import transaction from django.utils import timezone @@ -6,6 +6,8 @@ from main.models import CrawlRequest, CrawlerInstance, CrawlerQueue, CrawlerQueueItem from main.forms import ParameterHandlerFormSet, ResponseHandlerFormSet +CRAWLER_QUEUE = None + try: CRAWLER_QUEUE = CrawlerQueue.object() diff --git a/main/views.py b/main/views.py index d1233a3a..431237c4 100644 --- a/main/views.py +++ b/main/views.py @@ -2,7 +2,6 @@ import multiprocessing as mp from django.core.paginator import Paginator -from django.db.models import Q from django.http import HttpResponse, HttpResponseRedirect from django.shortcuts import get_object_or_404, redirect, render diff --git a/spider_manager/requirements.txt b/spider_manager/requirements.txt index 1be5ea23..d04463cd 100644 --- a/spider_manager/requirements.txt +++ b/spider_manager/requirements.txt @@ -1,9 +1,9 @@ -../src/crawling_utils/ -../src/step-by-step/ -../src/camuflage_scrapy/ -../src/captcha_solver/ -../src/cssify/ -./scutils/ +../crawling_utils/ +../step-by-step/ +../camuflage_scrapy/ +../captcha_solver/ +../cssify/ +../scutils/ attrs>=19.2.0 # 18.1.0 # Updated from 17.2.0' cchardet==2.1.7 cffi==1.12.3 # Updated from 1.10.0 diff --git a/spider_manager/src/settings.py b/spider_manager/src/settings.py index e2715e68..2127fa21 100644 --- a/spider_manager/src/settings.py +++ b/spider_manager/src/settings.py @@ -3,12 +3,12 @@ # Zookeeper host information ZOOKEEPER_ASSIGN_PATH = os.getenv('ZOOKEEPER_ASSIGN_PATH', '/scrapy-cluster/crawler/') ZOOKEEPER_ID = os.getenv('ZOOKEEPER_ID', 'all') -ZOOKEEPER_HOSTS = [x.strip() for x in os.getenv('ZOOKEEPER_HOSTS', 'localhost:2181').split(',')] +ZOOKEEPER_HOSTS = [x.strip() for x in os.getenv('ZOOKEEPER_HOSTS', 'zookeeper:2181').split(',')] # Kafka host information KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' KAFKA_CONSUMER_TIMEOUT = 120000 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000 @@ -21,7 +21,7 @@ KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 
1000 # Redis host information -REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') +REDIS_HOST = os.getenv('REDIS_HOST', 'redis') REDIS_PORT = int(os.getenv('REDIS_PORT', 6379)) REDIS_DB = int(os.getenv('REDIS_DB', 0)) REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None) @@ -33,4 +33,4 @@ COMMANDS_TOPIC = os.getenv('COMMANDS_TOPIC', KAFKA_TOPIC_PREFIX + '.commands') NOTIFICATIONS_TOPIC = os.getenv('NOTIFICATIONS_TOPIC', KAFKA_TOPIC_PREFIX + '.notifications') -OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/home/elves/Desktop/data') +OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/data') diff --git a/writer/requirements.txt b/writer/requirements.txt index 7772d232..542d46fe 100644 --- a/writer/requirements.txt +++ b/writer/requirements.txt @@ -1,4 +1,4 @@ -../src/crawling_utils/ +../crawling_utils/ beautifulsoup4==4.11.1 certifi==2021.5.30 chardet==4.0.0 diff --git a/writer/src/settings.py b/writer/src/settings.py index aa7b454a..8c6c1488 100644 --- a/writer/src/settings.py +++ b/writer/src/settings.py @@ -1,6 +1,6 @@ import os -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' @@ -32,4 +32,4 @@ FILE_DOWNLOADER_CONSUMER_GROUP = os.getenv('FILE_DOWNLOADER_CONSUMER_GROUP', KAFKA_TOPIC_PREFIX + '.file_downloader_group') FILE_DESCRIPTOR_CONSUMER_GROUP = os.getenv('FILE_DESCRIPTOR_CONSUMER_GROUP', KAFKA_TOPIC_PREFIX + '.file_descriptor_group') -OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/home/elves/Desktop/data') \ No newline at end of file +OUTPUT_FOLDER = os.getenv('OUTPUT_FOLDER', '/data') \ No newline at end of file From ece26b91c4e4c4749050005aef9bcc9d80e5b8e1 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 6 Jun 2023 17:07:51 -0300 Subject: [PATCH 75/89] Using swagger --- api/urls.py | 49 +++++++++++++----------------- api/views/crawler.py | 56 +++++++++++++++++++++++++++++++++-- api/views/crawler_instance.py | 2 +- api/views/task.py | 30 +++++++++++++++++-- interface/settings.py | 1 + 5 files changed, 104 insertions(+), 34 deletions(-) diff --git a/api/urls.py b/api/urls.py index 48b77dc5..09cc3dbe 100644 --- a/api/urls.py +++ b/api/urls.py @@ -1,7 +1,10 @@ +from django.shortcuts import redirect from django.urls import path from django.views.generic import TemplateView -from rest_framework.schemas import get_schema_view -from django.shortcuts import redirect +from drf_yasg import openapi +from drf_yasg.views import get_schema_view +from rest_framework import permissions + from . 
import views app_name = 'api' @@ -12,8 +15,19 @@ only_list_action = {'get': 'list'} only_retrieve_action = {'get': 'retrieve'} +schema_view = get_schema_view( + openapi.Info( + title="Plataforma de Coletas - API", + default_version="1.0", + description="API para as principais funcionalidades da plataforma de coletas.", + license=openapi.License(name="BSD License"), + ), + public=True, + permission_classes=(permissions.AllowAny,), +) + urlpatterns = [ - path('', lambda request: redirect('api:swagger-ui', permanent=True)), + path('', lambda request: redirect('api:swagger', permanent=True)), # crawler info path('crawler/', views.CrawlerViewSet.as_view(list_and_create_actions), name='crawler'), @@ -43,7 +57,7 @@ path('instance//page/duplicated', views.CrawlerInstanceViewSet.as_view({'get': 'page_duplicated'}), name='instance-duplicated-download-page'), # instance get logs - path('instance//log/tail', views.CrawlerInstanceViewSet.as_view({'get': 'tail_log'}), name='instance-log-tail'), + path('instance//log/tail', views.CrawlerInstanceViewSet.as_view({'get': 'tail'}), name='instance-log-tail'), path('instance//log/raw/error', views.CrawlerInstanceViewSet.as_view({'get': 'raw_log_err'}), name='instance-log-raw-error'), path('instance//log/raw/out', views.CrawlerInstanceViewSet.as_view({'get': 'raw_log_out'}), name='instance-log-raw-out'), @@ -62,28 +76,5 @@ path('queue/force_execution/', views.CrawlerQueueViewSet.as_view({'get': 'force_execution'}), name='queue-force-execution'), path('queue/remove_item/', views.CrawlerQueueViewSet.as_view({'get': 'remove_item'}), name='queue-remove-item'), - path('open-api/', get_schema_view( - title='Plataforma de Coletas - API', - description='API para as principais funcionalidades da plataforma de coletas.', - version='1.0.0', - public=True, - url='/api/', - urlconf='api.urls' - ), name='open-api'), - path('swagger-ui/', TemplateView.as_view( - template_name='api/swagger-ui.html', - extra_context={'schema_url':'api:open-api'} - ), name='swagger-ui') -] - -# # Includes the API endpoints in the URLs -# url(r'^api/', include(api_router.urls)), -# path('openapi/', get_schema_view( -# title='Áduna', -# description='API para busca de dados não estruturados', -# url='/services/', -# version='1.0.0', -# urlconf='services.urls', -# public=True, -# ), name='openapi'), -# ] + path('swagger/', schema_view.with_ui('swagger'), name='swagger'), +] \ No newline at end of file diff --git a/api/views/crawler.py b/api/views/crawler.py index 4e7c8b38..88a1a519 100644 --- a/api/views/crawler.py +++ b/api/views/crawler.py @@ -10,6 +10,9 @@ from main.utils import (add_crawl_request, unqueue_crawl_requests, process_run_crawl, process_stop_crawl) +from drf_yasg.utils import swagger_auto_schema +from drf_yasg import openapi + class CrawlerViewSet(viewsets.ModelViewSet): """ ViewSet that allows crawlers to be viewed, edited, updated and removed. @@ -29,6 +32,10 @@ def _create_templated_url_response_handlers(self, response_handlers, crawler_id) handler['injection_type'] = 'templated_url' ResponseHandler.objects.create(**handler) + @swagger_auto_schema( + operation_summary='Cria um novo coletor.', + operation_description='Ao chamar por esse endpoint, um novo coletor será criado e retornado em formato JSON.', + ) def create(self, request, *args, **kwargs): """ Create a new crawler. 
@@ -55,10 +62,31 @@ def create(self, request, *args, **kwargs): return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST) + @swagger_auto_schema( + operation_summary='Executa o coletor.', + operation_description='Ao chamar por esse endpoint, o coletor irá para a fila de coletas. A fila em que aguardará depende de seu parâmetro `expected_runtime_category`.', + manual_parameters=[ + openapi.Parameter( + 'id', + openapi.IN_PATH, + description='ID único do crawler.', + type=openapi.TYPE_INTEGER + ), + openapi.Parameter( + 'action', + openapi.IN_QUERY, + description='Esse parâmetro permite definir o comportamento do coletor ao chegar na fila de coletas, sendo as opções: `run_immediately` (xecuta imediatamente), `wait_on_first_queue_position` (aguarda execução na primeira posição da fila) e `wait_on_last_queue_position` (aguarda na última posição).', + type=openapi.TYPE_STRING, + default='wait_on_last_queue_position', + enum=['run_immediately', 'wait_on_first_queue_position', 'wait_on_last_queue_position'], + required=False + ) + ], + ) @action(detail=True, methods=['get']) def run(self, request, pk): query_params = self.request.query_params.dict() - action = query_params.get('action', '') + action = query_params.get('action', 'wait_on_last_queue_position') if action == 'run_immediately': wait_on = 'no_wait' @@ -93,6 +121,18 @@ def run(self, request, pk): return Response({'message': message}, status=status.HTTP_200_OK) + @swagger_auto_schema( + operation_summary='Interrompe o coletor.', + operation_description='Ao chamar por esse endpoint, o coletor comecará o seu processo de encerramento, que pode ou não ser imediato.', + manual_parameters=[ + openapi.Parameter( + 'id', + openapi.IN_PATH, + description='ID único do crawler.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def stop(self, request, pk): try: @@ -102,7 +142,19 @@ def stop(self, request, pk): return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) return Response(status=status.HTTP_204_NO_CONTENT) - + + @swagger_auto_schema( + operation_summary='Cria agrupamentos baseado em determinado coletor.', + operation_description='Retorna um grupo é de coletores dinâmicos que possuem os mesmos passos que o coletor de `id` passado como parâmetro.', + manual_parameters=[ + openapi.Parameter( + 'id', + openapi.IN_PATH, + description='ID único do crawler.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def group(self, request, pk): crawlers = CrawlRequest.objects.raw( diff --git a/api/views/crawler_instance.py b/api/views/crawler_instance.py index def075b3..c3da0148 100644 --- a/api/views/crawler_instance.py +++ b/api/views/crawler_instance.py @@ -212,7 +212,7 @@ def raw_log_out(self, request, pk): return resp @action(detail=True, methods=['get']) - def tail_log(self, request, pk): + def tail(self, request, pk): try: instance = CrawlerInstance.objects.get(instance_id=pk) diff --git a/api/views/task.py b/api/views/task.py index ac0015bf..72f0f1d2 100644 --- a/api/views/task.py +++ b/api/views/task.py @@ -11,11 +11,17 @@ import crawler_manager.crawler_manager as crawler_manager from crawler_manager.settings import TASK_TOPIC + +from drf_yasg.utils import swagger_auto_schema class TaskViewSet(viewsets.ModelViewSet): queryset = Task.objects.all() serializer_class = TaskSerializer + @swagger_auto_schema( + operation_summary="Run custom logic", + operation_description="This is the description of the `run` method.", + ) def create(self, request): response = 
super().create(request) if response.status_code == status.HTTP_201_CREATED: @@ -26,6 +32,10 @@ def create(self, request): crawler_manager.message_sender.send(TASK_TOPIC, message) return response + @swagger_auto_schema( + operation_summary="Run custom logic", + operation_description="This is the description of the `run` method.", + ) def update(self, request, pk=None): response = super().update(request, pk=pk) if response.status_code == status.HTTP_200_OK: @@ -36,6 +46,10 @@ def update(self, request, pk=None): crawler_manager.message_sender.send(TASK_TOPIC, message) return response + @swagger_auto_schema( + operation_summary="Run custom logic", + operation_description="This is the description of the `run` method.", + ) def partial_update(self, request, pk=None): response = super().partial_update(request, pk=pk) if response.status_code == status.HTTP_200_OK: @@ -46,6 +60,10 @@ def partial_update(self, request, pk=None): crawler_manager.message_sender.send(TASK_TOPIC, message) return response + @swagger_auto_schema( + operation_summary="Run custom logic", + operation_description="This is the description of the `run` method.", + ) def destroy(self, request, pk=None): response = super().destroy(request, pk=pk) if response.status_code == status.HTTP_204_NO_CONTENT: @@ -57,7 +75,11 @@ def destroy(self, request, pk=None): } crawler_manager.message_sender.send(TASK_TOPIC, message) return response - + + @swagger_auto_schema( + operation_summary="Run custom logic", + operation_description="This is the description of the `run` method.", + ) def __str2date(self, s: str) -> datetime: date = None @@ -68,7 +90,11 @@ def __str2date(self, s: str) -> datetime: print(e) return date - + + @swagger_auto_schema( + operation_summary="Run custom logic", + operation_description="This is the description of the `run` method.", + ) @action(detail=False) def filter(self, request): query_params = self.request.query_params.dict() diff --git a/interface/settings.py b/interface/settings.py index ccc5caab..fb37e6f4 100644 --- a/interface/settings.py +++ b/interface/settings.py @@ -59,6 +59,7 @@ 'django.contrib.staticfiles', 'main.apps.MainConfig', 'crispy_forms', + 'drf_yasg', 'rest_framework', 'mathfilters', 'api.apps.ApiConfig' From eea26cee92d6c69e36d4139fbe6f8b900c182b16 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Mon, 12 Jun 2023 14:31:06 -0300 Subject: [PATCH 76/89] Updates the API docs --- api/urls.py | 2 +- api/views/crawler.py | 149 ++++++++++++++++ api/views/crawler_instance.py | 313 +++++++++++++++++++++++++++++++++- api/views/crawler_queue.py | 169 +++++++++++++++++- api/views/task.py | 120 ++++++++++--- crawler_manager/settings.py | 6 +- 6 files changed, 720 insertions(+), 39 deletions(-) diff --git a/api/urls.py b/api/urls.py index 09cc3dbe..1abf78c1 100644 --- a/api/urls.py +++ b/api/urls.py @@ -41,7 +41,7 @@ path('instance/', views.CrawlerInstanceViewSet.as_view(only_retrieve_action), name='instance-detail'), # instance config export - path('instance//export/config', views.CrawlerInstanceViewSet.as_view({'get': 'export_config'}), name='instance-export-config'), + path('instance//config', views.CrawlerInstanceViewSet.as_view({'get': 'export_config'}), name='instance-config'), # instance update file download status path('instance//file/found/', views.CrawlerInstanceViewSet.as_view({'get': 'files_found'}), name='instance-files-found'), diff --git a/api/views/crawler.py b/api/views/crawler.py index 88a1a519..04560bdb 100644 --- a/api/views/crawler.py +++ b/api/views/crawler.py @@ -32,9 +32,158 @@ def 
_create_templated_url_response_handlers(self, response_handlers, crawler_id) handler['injection_type'] = 'templated_url' ResponseHandler.objects.create(**handler) + @swagger_auto_schema( + operation_summary='Retorna a lista de coletores.', + operation_description='Ao chamar por esse endpoint, uma lista de coletores será retornada em formato JSON.', + responses={ + 200: openapi.Response( + description='Lista de coletores.', + schema=openapi.Schema( + type=openapi.TYPE_ARRAY, + items=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID único do coletor.' + ) + } + ) + ) + ) + } + ) + def list(self, request, *args, **kwargs): + return super().list(request, *args, **kwargs) + + @swagger_auto_schema( + operation_summary='Retorna um coletor.', + operation_description='Ao chamar por esse endpoint, um coletor será retornado em formato JSON.', + responses={ + 200: openapi.Response( + description='Coletor.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID único do coletor.' + ) + } + ) + ) + }, + manual_parameters=[ + openapi.Parameter( + name='id', + in_=openapi.IN_PATH, + type=openapi.TYPE_INTEGER, + description='ID único do coletor.', + required=True + ) + ] + ) + def retrieve(self, request, *args, **kwargs): + return super().retrieve(request, *args, **kwargs) + + @swagger_auto_schema( + operation_summary='Atualiza um coletor.', + operation_description='Ao chamar por esse endpoint, um coletor será atualizado e retornado em formato JSON.', + request_body=openapi.Schema( + type=openapi.TYPE_OBJECT, + ), + responses={ + 200: openapi.Response( + description='Coletor atualizado com sucesso.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID único do coletor.' + ) + } + ) + ), + 400: openapi.Response( + description='Erro na requisição.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' + ) + } + ) + ) + }, + manual_parameters=[ + openapi.Parameter( + name='id', + in_=openapi.IN_PATH, + type=openapi.TYPE_INTEGER, + description='ID único do coletor.', + required=True + ) + ] + ) + def update(self, request, *args, **kwargs): + return super().update(request, *args, **kwargs) + + @swagger_auto_schema( + operation_summary='Remove um coletor.', + operation_description='Ao chamar por esse endpoint, um coletor será removido.', + responses={ + 204: openapi.Response( + description='Coletor removido com sucesso.' + ) + }, + manual_parameters=[ + openapi.Parameter( + name='id', + in_=openapi.IN_PATH, + type=openapi.TYPE_INTEGER, + description='ID único do coletor.', + required=True + ) + ] + ) + def destroy(self, request, *args, **kwargs): + return super().destroy(request, *args, **kwargs) + @swagger_auto_schema( operation_summary='Cria um novo coletor.', operation_description='Ao chamar por esse endpoint, um novo coletor será criado e retornado em formato JSON.', + request_body=openapi.Schema( + type=openapi.TYPE_OBJECT, + ), + responses={ + 201: openapi.Response( + description='Coletor criado com sucesso.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID único do coletor.' 
+ ) + } + ) + ), + 400: openapi.Response( + description='Erro na requisição.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' + ) + } + ) + ) + } ) def create(self, request, *args, **kwargs): """ diff --git a/api/views/crawler_instance.py b/api/views/crawler_instance.py index c3da0148..efe500ab 100644 --- a/api/views/crawler_instance.py +++ b/api/views/crawler_instance.py @@ -11,13 +11,14 @@ from rest_framework import status from django.http import FileResponse, JsonResponse from typing_extensions import Literal +from drf_yasg.utils import swagger_auto_schema +from drf_yasg import openapi from main.models import CrawlerInstance, CrawlRequest from main.utils import process_stop_crawl from main.serializers import CrawlerInstanceSerializer from crawler_manager.settings import OUTPUT_FOLDER - class CrawlerInstanceViewSet(viewsets.ReadOnlyModelViewSet): """ A simple ViewSet for viewing and listing instances @@ -25,6 +26,84 @@ class CrawlerInstanceViewSet(viewsets.ReadOnlyModelViewSet): queryset = CrawlerInstance.objects.all() serializer_class = CrawlerInstanceSerializer + @swagger_auto_schema( + operation_summary='Retorna um arquvivo json com a representação das instâncias.', + operation_description='Retorna o estado das instâncias serializado em json.', + responses={ + 200: openapi.Response( + description='Instâncias encontradas.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + ) + ), + 404: openapi.Response( + description='Instâncias não encontradas.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' + ) + } + ) + ) + } + ) + def list(self, request): + return super().list(request) + + @swagger_auto_schema( + operation_summary='Retorna um arquvivo json com a representação da instância.', + operation_description='Retorna o estado da instância serializado em json.', + responses={ + 200: openapi.Response( + description='Instância encontrada.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + ) + ), + 404: openapi.Response( + description='Instância não encontrada.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' + ) + } + ) + ) + } + ) + def retrieve(self, request, pk=None): + return super().retrieve(request, pk) + + @swagger_auto_schema( + operation_summary='Retorna um arquvivo json com a representação da instância.', + operation_description='Retorna o estado da instância serializado em json.', + responses={ + 200: openapi.Response( + description='Instância encontrada.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + ) + ), + 404: openapi.Response( + description='Instância não encontrada.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' 
+ ) + } + ) + ) + } + ) @action(detail=True, methods=['get']) def export_config(self, request, pk): try: @@ -120,42 +199,180 @@ def _update_page_info(self, instance_id, val: int = 1): return self._update_download_info(instance_id, 'page', operation, val) + @swagger_auto_schema( + operation_summary='Atualiza a quantidade de links de arquivos encontrados em determinada página.', + operation_description='Esse endpoint recebe com parâmetro `num_files`, que é utilizado para atualizar a quantidade de links de arquivos encontrados em determinada página.', + operation_id='instance_update_files_found', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def files_found(self, request, pk, num_files): return self._update_file_info(pk, 'found', num_files) - + + @swagger_auto_schema( + operation_summary='Incrementa a quantidade de arquivos baixados com sucesso em determinada instância.', + operation_description='Esse endpoint incrementa a quantidade de arquivos baixados com sucesso em determinada instância em 1 unidade toda vez que é chamado.', + operation_id='instance_update_file_success', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def file_success(self, request, pk): return self._update_file_info(pk, 'success') - + + @swagger_auto_schema( + operation_summary='Incrementa a quantidade de arquivos com status já baixados em determinada instância.', + operation_description='Esse endpoint incrementa a quantidade de arquivos com status já baixados em determinada instância em 1 unidade toda vez que é chamado.', + operation_id='instance_update_file_previously', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def file_previously(self, request, pk): return self._update_file_info(pk, 'previously') - + + @swagger_auto_schema( + operation_summary='Incrementa a quantidade de arquivos com erros de download em determinada instância.', + operation_description='Esse endpoint incrementa a quantidade de arquivos com erros de download em determinada instância em 1 unidade toda vez que é chamado.', + operation_id='instance_update_error', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def file_error(self, request, pk): return self._update_file_info(pk, 'error') - + + @swagger_auto_schema( + operation_summary='Atualiza a quantidade de links de páginas encontrados para serem exploradas.', + operation_description='Esse endpoint recebe com parâmetro `num_files`, que é utilizado para atualizar a quantidade de links de páginas encontrados para serem exploradas.', + operation_id='instance_update_pages_found', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def pages_found(self, request, pk, num_files): return self._update_page_info(pk, 'found', num_files) + @swagger_auto_schema( + operation_summary='Incrementa a quantidade de páginas exploradas com sucesso em determinada instância.', + 
operation_description='Esse endpoint incrementa a quantidade de páginas exploradas com sucesso em determinada instância em 1 unidade toda vez que é chamado.', + operation_id='instance_update_page_success', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def page_success(self, request, pk): return self._update_page_info(pk, 'success') - + + @swagger_auto_schema( + operation_summary='Incrementa a quantidade de páginas com status já exploradas em determinada instância.', + operation_description='Esse endpoint incrementa a quantidade de páginas com status já exploradas em determinada instância em 1 unidade toda vez que é chamado.', + operation_id='instance_update_page_previously', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def page_previously(self, request, pk): return self._update_page_info(pk, 'previously') + @swagger_auto_schema( + operation_summary='Incrementa a quantidade de páginas com status duplicado em determinada instância.', + operation_description='Esse endpoint incrementa a quantidade de páginas com status duplicado em determinada instância em 1 unidade toda vez que é chamado.', + operation_id='instance_update_page_duplicated', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def page_duplicated(self, request, pk): return self._update_page_info(pk, 'duplicated') + @swagger_auto_schema( + operation_summary='Incrementa a quantidade de páginas com erros de exploração em determinada instância.', + operation_description='Esse endpoint incrementa a quantidade de páginas com erros de exploração em determinada instância em 1 unidade toda vez que é chamado.', + operation_id='instance_update_page_error', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + ] + ) @action(detail=True, methods=['get']) def page_error(self, request, pk): return self._update_page_info(pk, 'error') + @swagger_auto_schema( + operation_summary='Obtêm logs brutos de erros.', + operation_description='Esse endpoint obtêm os logs brutos de erros durante a execução de determinada instância.', + operation_id='instance_raw_log_err', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + openapi.Parameter( + 'n_lines', + openapi.IN_QUERY, + description='Número de linhas de logs a serem retornadas.', + type=openapi.TYPE_INTEGER, + default=100, + required=False + ), + ] + ) def raw_log_err(self, request, pk): try: instance = CrawlerInstance.objects.get(instance_id=pk) @@ -184,6 +401,27 @@ def raw_log_err(self, request, pk): return resp + @swagger_auto_schema( + operation_summary='Obtêm logs brutos de saída.', + operation_description='Esse endpoint obtêm os logs brutos de saída durante a execução de determinada instância.', + operation_id='instance_raw_log_out', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + openapi.Parameter( + 'n_lines', + openapi.IN_QUERY, + 
description='Número de linhas de logs a serem retornadas.', + type=openapi.TYPE_INTEGER, + default=100, + required=False + ), + ] + ) def raw_log_out(self, request, pk): try: instance = CrawlerInstance.objects.get(instance_id=pk) @@ -211,6 +449,27 @@ def raw_log_out(self, request, pk): return resp + @swagger_auto_schema( + operation_summary='Obtêm logs de saída ou erro do sistema.', + operation_description='Esse endpoint obtêm os logs de saída ou erro do sistema durante a execução de determinada instância.', + operation_id='instance_log', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + openapi.Parameter( + 'n_lines', + openapi.IN_QUERY, + description='Número de linhas de logs a serem retornadas.', + type=openapi.TYPE_INTEGER, + default=100, + required=False + ) + ] + ) @action(detail=True, methods=['get']) def tail(self, request, pk): try: @@ -262,6 +521,35 @@ def tail(self, request, pk): 'time': str(datetime.fromtimestamp(time.time())), }, status=status.HTTP_200_OK) + @swagger_auto_schema( + operation_summary='Obtêm screenshots de determinada instância.', + operation_description='Esse endpoint obtêm os screenshots de determinada instância.', + operation_id='instance_screenshots', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ), + openapi.Parameter( + 'imgs_per_page', + openapi.IN_QUERY, + description='Número de imagens por página.', + type=openapi.TYPE_INTEGER, + default=20, + required=False + ), + openapi.Parameter( + 'page', + openapi.IN_QUERY, + description='Número da página.', + type=openapi.TYPE_INTEGER, + default=1, + required=False + ) + ] + ) @action(detail=True, methods=['get']) def screenshots(request, pk): try: @@ -312,6 +600,19 @@ def screenshots(request, pk): 'total_screenshots': total_screenshots }, status=status.HTTP_200_OK) + @swagger_auto_schema( + operation_summary='Obtêm o arquivo de tracing de execução de coletores dinâmicos.', + operation_description='Esse endpoint obtêm o arquivo de tracing de execução de coletores dinâmicos.', + operation_id='instance_export_trace', + manual_parameters=[ + openapi.Parameter( + 'instance_id', + openapi.IN_PATH, + description='ID único da instância.', + type=openapi.TYPE_INTEGER + ) + ] + ) @action(detail=True, methods=['get']) def export_trace(self, request, pk): try: diff --git a/api/views/crawler_queue.py b/api/views/crawler_queue.py index ef53d453..b637cc4c 100644 --- a/api/views/crawler_queue.py +++ b/api/views/crawler_queue.py @@ -4,18 +4,95 @@ from rest_framework.decorators import action from rest_framework.response import Response +from drf_yasg.utils import swagger_auto_schema +from drf_yasg import openapi + from main.models import CrawlerQueue, CrawlerQueueItem from main.serializers import CrawlerQueueSerializer from main.utils import (process_run_crawl, unqueue_crawl_requests, CRAWLER_QUEUE) + + class CrawlerQueueViewSet(viewsets.ModelViewSet): queryset = CrawlerQueue.objects.all() serializer_class = CrawlerQueueSerializer http_method_names = ['get', 'put'] + + @swagger_auto_schema( + operation_summary='Retorna a fila de execução', + operation_description='Retorna a fila de execução', + responses={ + 200: openapi.Response( + description='Fila de execução retornada com sucesso', + examples={ + 'application/json': { + 'success': True, + 'data': { + 'queue': [ + { + 'id': 1, + 'url': 'http://www.google.com', + 'status': 
'pending', + 'created_at': '2020-01-01T00:00:00Z', + 'updated_at': '2020-01-01T00:00:00Z' + } + ] + } + } + } + ) + } + ) def retrieve(self, request): crawler_queue = CrawlerQueue.to_dict() return Response(crawler_queue) - + + @swagger_auto_schema( + operation_summary='Troca a posição de dois itens da fila de execução', + operation_description='Troca a posição do item A com o item B na fila de execução', + manual_parameters=[ + openapi.Parameter( + name='a', + in_=openapi.IN_QUERY, + description='ID do item A', + required=True, + type=openapi.TYPE_INTEGER + ), + openapi.Parameter( + name='b', + in_=openapi.IN_QUERY, + description='ID do item B', + required=True, + type=openapi.TYPE_INTEGER + ) + ], + responses={ + 200: openapi.Response( + description='Posições trocadas com sucesso', + examples={ + 'application/json': { + 'success': True + } + } + ), + 400: openapi.Response( + description='Os itens devem estar na mesma fila', + examples={ + 'application/json': { + 'error': 'Crawler queue items must be in same queue!' + } + } + ), + 404: openapi.Response( + description='Item A ou B não encontrado', + examples={ + 'application/json': { + 'error': 'Crawler queue item 1 not found!' + } + } + ) + } + ) @action(detail=False, methods=['get']) def switch_position(self, request, a: int, b: int): with transaction.atomic(): @@ -44,6 +121,38 @@ def switch_position(self, request, a: int, b: int): return Response(status=status.HTTP_200_OK) + @swagger_auto_schema( + operation_summary='Executa um crawler imediatamente', + operation_description='Executa um crawler imediatamente, ignorando a fila de execução', + manual_parameters=[ + openapi.Parameter( + name='item_id', + in_=openapi.IN_QUERY, + description='ID do item da fila de execução', + required=True, + type=openapi.TYPE_INTEGER + ) + ], + responses={ + 200: openapi.Response( + description='Crawler executado com sucesso', + examples={ + 'application/json': { + 'crawler_id': 1, + 'instance_id': 1 + } + } + ), + 404: openapi.Response( + description='Item da fila de execução não encontrado', + examples={ + 'application/json': { + 'error': 'Crawler queue item 1 not found!' + } + } + ) + } + ) @action(detail=False, methods=['get']) def force_execution(self, request, item_id: int): with transaction.atomic(): @@ -68,6 +177,32 @@ def force_execution(self, request, item_id: int): return Response(data, status=status.HTTP_200_OK) + @swagger_auto_schema( + operation_summary='Remove um item da fila de execução', + operation_description='Remove um item da fila de execução', + manual_parameters=[ + openapi.Parameter( + name='item_id', + in_=openapi.IN_QUERY, + description='ID do item da fila de execução', + required=True, + type=openapi.TYPE_INTEGER + ) + ], + responses={ + 204: openapi.Response( + description='Item removido com sucesso' + ), + 404: openapi.Response( + description='Item da fila de execução não encontrado', + examples={ + 'application/json': { + 'error': 'Crawler queue item 1 not found!' 
+ } + } + ) + } + ) @action(detail=False, methods=['get']) def remove_item(self, request, item_id: int): try: @@ -82,6 +217,38 @@ def remove_item(self, request, item_id: int): def get_object(self): return CrawlerQueue.object() + @swagger_auto_schema( + operation_summary='Atualiza as configurações da fila de execução', + operation_description='Atualiza as configurações da fila de execução', + request_body=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'max_fast_runtime_crawlers_running': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='Número máximo de crawlers rápidos em execução', + minimum=1, + maximum=100 + ), + 'max_medium_runtime_crawlers_running': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='Número máximo de crawlers médios em execução', + minimum=1, + maximum=100 + ), + 'max_slow_runtime_crawlers_running': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='Número máximo de crawlers lentos em execução', + minimum=1, + maximum=100 + ) + } + ), + responses={ + 200: openapi.Response( + description='Configurações atualizadas com sucesso' + ) + } + ) def update(self, request): response = super().update(request) diff --git a/api/views/task.py b/api/views/task.py index 72f0f1d2..07315f6f 100644 --- a/api/views/task.py +++ b/api/views/task.py @@ -5,22 +5,57 @@ from rest_framework.response import Response from rest_framework.decorators import action +from drf_yasg.utils import swagger_auto_schema +from drf_yasg import openapi + from main.models import Task from main.serializers import TaskSerializer from main.task_filter import task_filter_by_date_interval import crawler_manager.crawler_manager as crawler_manager from crawler_manager.settings import TASK_TOPIC - -from drf_yasg.utils import swagger_auto_schema class TaskViewSet(viewsets.ModelViewSet): queryset = Task.objects.all() serializer_class = TaskSerializer @swagger_auto_schema( - operation_summary="Run custom logic", - operation_description="This is the description of the `run` method.", + operation_summary="Obtêm todos agendamentos de coletas.", + operation_description="Este endpoint obtêm todos agendamentos de coletas.", + responses={ + 200: 'OK' + } + ) + def list(self, request, *args, **kwargs): + return super().list(request, *args, **kwargs) + + @swagger_auto_schema( + operation_summary="Obtêm um agendamento de coleta.", + operation_description="Este endpoint obtêm um agendamento de coleta.", + manual_parameters=[ + openapi.Parameter( + name='id', + in_=openapi.IN_QUERY, + description='ID único do agendamento de coleta', + required=True, + type=openapi.TYPE_INTEGER + ) + ], + responses={ + 200: 'OK', + 404: 'Not Found' + } + ) + def retrieve(self, request, pk=None): + return super().retrieve(request, pk=pk) + + @swagger_auto_schema( + operation_summary="Cria um novo agendamento de coleta.", + operation_description="Este endpoint cria um novo agendamento de coleta.", + responses={ + 201: 'Created', + 400: 'Bad Request' + } ) def create(self, request): response = super().create(request) @@ -33,8 +68,22 @@ def create(self, request): return response @swagger_auto_schema( - operation_summary="Run custom logic", - operation_description="This is the description of the `run` method.", + operation_summary="Atualiza um agendamento de coleta.", + operation_description="Este endpoint atualiza um agendamento de coleta.", + manual_parameters=[ + openapi.Parameter( + name='id', + in_=openapi.IN_QUERY, + description='ID único do agendamento de coleta', + required=True, + type=openapi.TYPE_INTEGER + 
) + ], + responses={ + 200: 'OK', + 400: 'Bad Request', + 404: 'Not Found' + } ) def update(self, request, pk=None): response = super().update(request, pk=pk) @@ -47,22 +96,21 @@ def update(self, request, pk=None): return response @swagger_auto_schema( - operation_summary="Run custom logic", - operation_description="This is the description of the `run` method.", - ) - def partial_update(self, request, pk=None): - response = super().partial_update(request, pk=pk) - if response.status_code == status.HTTP_200_OK: - message = { - 'action': 'update', - 'data': response.data - } - crawler_manager.message_sender.send(TASK_TOPIC, message) - return response - - @swagger_auto_schema( - operation_summary="Run custom logic", - operation_description="This is the description of the `run` method.", + operation_summary="Remove um agendamento de coleta.", + operation_description="Remove um agendamento de coleta.", + manual_parameters=[ + openapi.Parameter( + name='id', + in_=openapi.IN_QUERY, + description='ID único do agendamento de coleta', + required=True, + type=openapi.TYPE_INTEGER + ) + ], + responses={ + 204: 'No Content', + 404: 'Not Found' + } ) def destroy(self, request, pk=None): response = super().destroy(request, pk=pk) @@ -76,10 +124,6 @@ def destroy(self, request, pk=None): crawler_manager.message_sender.send(TASK_TOPIC, message) return response - @swagger_auto_schema( - operation_summary="Run custom logic", - operation_description="This is the description of the `run` method.", - ) def __str2date(self, s: str) -> datetime: date = None @@ -92,8 +136,28 @@ def __str2date(self, s: str) -> datetime: return date @swagger_auto_schema( - operation_summary="Run custom logic", - operation_description="This is the description of the `run` method.", + operation_summary="Filtra agendamentos de coleta por intervalo de datas.", + operation_description="Filtra agendamentos de coleta por intervalo de datas.", + manual_parameters=[ + openapi.Parameter( + name='start_date', + in_=openapi.IN_QUERY, + description='Data de início do intervalo', + required=True, + type=openapi.TYPE_STRING + ), + openapi.Parameter( + name='end_date', + in_=openapi.IN_QUERY, + description='Data de fim do intervalo', + required=True, + type=openapi.TYPE_STRING + ) + ], + responses={ + 200: 'OK', + 400: 'Bad Request' + } ) @action(detail=False) def filter(self, request): diff --git a/crawler_manager/settings.py b/crawler_manager/settings.py index 6a07b7a3..d25071ee 100644 --- a/crawler_manager/settings.py +++ b/crawler_manager/settings.py @@ -3,7 +3,7 @@ # Kafka host information KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' KAFKA_CONSUMER_TIMEOUT = 120000 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000 @@ -16,7 +16,7 @@ KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000 # Redis host information -REDIS_HOST = os.getenv('REDIS_HOST', 'redis') +REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') REDIS_PORT = int(os.getenv('REDIS_PORT', 6379)) REDIS_DB = int(os.getenv('REDIS_DB', 0)) REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None) @@ -39,7 +39,7 @@ WRITER_TOPIC = os.getenv('WRITER_TOPIC', KAFKA_TOPIC_PREFIX + '.writer') STOPPED_SPIDER_NOTIFICATION_ADDRESS = os.getenv( - 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://web:8000/detail/stop_crawl/{crawler_id}') + 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 
'http://localhost:8000/detail/stop_crawl/{crawler_id}') TASK_TOPIC = os.getenv('TASK_TOPIC', KAFKA_TOPIC_PREFIX + 'task_topic') TASK_DATA_CONSUMER_GROUP = os.getenv('TASK_DATA_CONSUMER_DATA', KAFKA_TOPIC_PREFIX + '.task_data_group') From 040da799176c438e8d1b858f96bcc7b9f6d04575 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 13 Jun 2023 16:10:44 -0300 Subject: [PATCH 77/89] Updates the interface with the new endpoints --- api/views/crawler_instance.py | 6 +- api/views/crawler_queue.py | 1 - main/staticfiles/js/details.js | 7 +- main/staticfiles/js/screenshot_viewer.js | 95 ++++++++++++++---------- main/templates/main/detail_crawler.html | 8 +- 5 files changed, 69 insertions(+), 48 deletions(-) diff --git a/api/views/crawler_instance.py b/api/views/crawler_instance.py index efe500ab..3a84a903 100644 --- a/api/views/crawler_instance.py +++ b/api/views/crawler_instance.py @@ -551,12 +551,14 @@ def tail(self, request, pk): ] ) @action(detail=True, methods=['get']) - def screenshots(request, pk): + def screenshots(self, request, pk): try: instance = CrawlerInstance.objects.get(pk=pk) # get_object_or_404(CrawlerInstance, pk=instance_id) except: - return Response(status=status.HTTP_404_NOT_FOUND) + return Response({'error': 'Instance not found.', + 'total_screenshots': 0}, + status=status.HTTP_404_NOT_FOUND) imgs_per_page = int(request.GET.get('imgs_per_page', 20)) page = int(request.GET.get('page', 1)) diff --git a/api/views/crawler_queue.py b/api/views/crawler_queue.py index b637cc4c..1cd5dca1 100644 --- a/api/views/crawler_queue.py +++ b/api/views/crawler_queue.py @@ -12,7 +12,6 @@ from main.utils import (process_run_crawl, unqueue_crawl_requests, CRAWLER_QUEUE) - class CrawlerQueueViewSet(viewsets.ModelViewSet): queryset = CrawlerQueue.objects.all() serializer_class = CrawlerQueueSerializer diff --git a/main/staticfiles/js/details.js b/main/staticfiles/js/details.js index 52a8e3b4..0f1a7f0a 100644 --- a/main/staticfiles/js/details.js +++ b/main/staticfiles/js/details.js @@ -1,4 +1,6 @@ statusInterval = null //global var to control call interval +server_address = window.location.origin; + document.addEventListener('DOMContentLoaded', function () { var instance_id = document.getElementById("last_instance_id").innerText.trim(); @@ -35,9 +37,8 @@ function tail_logs(instance_id){ let progress_page_failure = $('#progress-page-failure'); let progress_page_duplicated = $('#progress-page-duplicated'); let progress_page_previously_crawled = $('#progress-page-previously-crawled'); - // calls tail log view and set logs - $.ajax("/tail_log_file/" + instance_id).done(function(data) { + $.ajax(`${server_address}/api/instance/${instance_id}/log/tail`).done(function(data) { var response = data; if (response["files_found"] != 0) { @@ -155,7 +156,7 @@ function status_instance(instance_id){ } }; - xhr.open("GET", "/api/instances/"+instance_id, true); + xhr.open("GET", "/api/instance/"+instance_id, true); xhr.send(); } diff --git a/main/staticfiles/js/screenshot_viewer.js b/main/staticfiles/js/screenshot_viewer.js index fa4c51f9..ce61b5e4 100644 --- a/main/staticfiles/js/screenshot_viewer.js +++ b/main/staticfiles/js/screenshot_viewer.js @@ -32,50 +32,69 @@ function displayScreenshotModal(instance_id) { $("#screenshot_modal .pagination").empty(); $("#screenshot_modal .screenshot_list").empty(); - $.ajax("/info/screenshots/" + instance_id + "/1").done(function (data) { - if ("error" in data) { - $("#screenshot_modal .screenshot_list").text(data['error']); - $("#screenshot_modal .spinner-border").hide(); 
- return; - } + let server_address = window.location.origin; + $("#screenshot_modal").modal("show"); - let items_per_page = data["data"].length; + $.ajax({ + url: `${server_address}/api/instance/${instance_id}/debug/screenshots`, + type: 'get', + dataType: 'json', + // async: false, + success: function (data) { + data = data.responseJSON; + + let items_per_page = data["data"].length; - $("#screenshot_modal .pagination").paging(data['total_screenshots'], { - format: '[< ncnnn >]', - perpage: items_per_page, - lapping: 0, - page: 1, - onSelect: function (page) { - $("#screenshot_modal .spinner-border").show(); - $("#screenshot_modal .screenshot_list").nanogallery2('destroy'); - $("#screenshot_modal .screenshot_list").empty(); - $.ajax("/info/screenshots/" + instance_id + "/" + page) - .done(function(new_data) { - updateGallery(new_data["data"]); - $("#screenshot_modal .spinner-border").hide(); - }); - }, - onFormat: function (type) { - switch (type) { - case 'block': // n and c - return '' + this.value + ''; - case 'next': // > - return ''; - case 'prev': // < - return ''; - case 'first': // [ - return ''; - case 'last': // ] - return ''; + $("#screenshot_modal .pagination").paging(data['total_screenshots'], { + format: '[< ncnnn >]', + perpage: items_per_page, + lapping: 0, + page: 1, + onSelect: function (page) { + $("#screenshot_modal .spinner-border").show(); + $("#screenshot_modal .screenshot_list").nanogallery2('destroy'); + $("#screenshot_modal .screenshot_list").empty(); + $.ajax("/info/screenshots/" + instance_id + "/" + page) + .done(function (new_data) { + updateGallery(new_data["data"]); + $("#screenshot_modal .spinner-border").hide(); + }); + }, + onFormat: function (type) { + switch (type) { + case 'block': // n and c + return '' + this.value + ''; + case 'next': // > + return ''; + case 'prev': // < + return ''; + case 'first': // [ + return ''; + case 'last': // ] + return ''; + } } - } - }); + }).fail(function () { + $("#screenshot_modal .screenshot_list").text("Failed to load screenshots"); + }); - $("#screenshot_modal .spinner-border").hide(); + $("#screenshot_modal .spinner-border").hide(); + // $("#screenshot_modal").modal("show"); + }, + error: function (data) { + data = data.responseJSON; + $("#screenshot_modal .screenshot_list").text(data['error']); + $("#screenshot_modal .spinner-border").hide(); + // $("#screenshot_modal").modal("hide"); + } }); - $("#screenshot_modal").modal("show"); + // $.ajax().done(function (data) { + // if ("error" in data) { + // return; + // } + // }); + } $('#screenshot_modal').on('hidden.bs.modal', function () { diff --git a/main/templates/main/detail_crawler.html b/main/templates/main/detail_crawler.html index 1a594d67..c13296ba 100644 --- a/main/templates/main/detail_crawler.html +++ b/main/templates/main/detail_crawler.html @@ -113,10 +113,10 @@

    Instances:

                    {{instance.duration_readable}} {{instance.num_data_files}} arquivos {{instance.data_size_readable}}
-                   Baixar
+                   Baixar
                    {% if crawler.dynamic_processing %}
-                   Baixar
+                   Baixar
                    {% endif %}
@@ -251,7 +251,7 @@
    Progresso de download de arquivos
    Últimas linhas do stdout
-                   Abrir Log Bruto
+                   Abrir Log Bruto
    @@ -275,7 +275,7 @@
    Últimas linhas do stdout
    Últimas linhas do stderr
-                   Abrir Log Bruto
+                   Abrir Log Bruto
    From 46cc5437b112680b52125953ca2d1dead07f1cc6 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 13 Jun 2023 16:54:39 -0300 Subject: [PATCH 78/89] Fix error in screenshot viewer --- crawler_manager/settings.py | 6 +++--- main/staticfiles/js/screenshot_viewer.js | 4 +--- requirements.txt | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/crawler_manager/settings.py b/crawler_manager/settings.py index d25071ee..6a07b7a3 100644 --- a/crawler_manager/settings.py +++ b/crawler_manager/settings.py @@ -3,7 +3,7 @@ # Kafka host information KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' KAFKA_CONSUMER_TIMEOUT = 120000 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000 @@ -16,7 +16,7 @@ KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000 # Redis host information -REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') +REDIS_HOST = os.getenv('REDIS_HOST', 'redis') REDIS_PORT = int(os.getenv('REDIS_PORT', 6379)) REDIS_DB = int(os.getenv('REDIS_DB', 0)) REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None) @@ -39,7 +39,7 @@ WRITER_TOPIC = os.getenv('WRITER_TOPIC', KAFKA_TOPIC_PREFIX + '.writer') STOPPED_SPIDER_NOTIFICATION_ADDRESS = os.getenv( - 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://localhost:8000/detail/stop_crawl/{crawler_id}') + 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://web:8000/detail/stop_crawl/{crawler_id}') TASK_TOPIC = os.getenv('TASK_TOPIC', KAFKA_TOPIC_PREFIX + 'task_topic') TASK_DATA_CONSUMER_GROUP = os.getenv('TASK_DATA_CONSUMER_DATA', KAFKA_TOPIC_PREFIX + '.task_data_group') diff --git a/main/staticfiles/js/screenshot_viewer.js b/main/staticfiles/js/screenshot_viewer.js index ce61b5e4..8c21ea49 100644 --- a/main/staticfiles/js/screenshot_viewer.js +++ b/main/staticfiles/js/screenshot_viewer.js @@ -41,8 +41,6 @@ function displayScreenshotModal(instance_id) { dataType: 'json', // async: false, success: function (data) { - data = data.responseJSON; - let items_per_page = data["data"].length; $("#screenshot_modal .pagination").paging(data['total_screenshots'], { @@ -54,7 +52,7 @@ function displayScreenshotModal(instance_id) { $("#screenshot_modal .spinner-border").show(); $("#screenshot_modal .screenshot_list").nanogallery2('destroy'); $("#screenshot_modal .screenshot_list").empty(); - $.ajax("/info/screenshots/" + instance_id + "/" + page) + $.ajax(`${server_address}/api/instance/${instance_id}/debug/screenshots?page=${page}&imgs_per_page=${items_per_page}`) .done(function (new_data) { updateGallery(new_data["data"]); $("#screenshot_modal .spinner-border").hide(); diff --git a/requirements.txt b/requirements.txt index fa0bfe0c..c157d78d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,7 @@ cssselect==1.1.0 cycler==0.11.0 decorator==5.1.1 django-crispy-forms==1.14.0 +drf-yasg==1.21.5 django-environ==0.8.1 django-mathfilters==1.0.0 djangorestframework==3.13.1 From 72243c1a2aae35d6c24cca9d33ea07de6a736eb9 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 13 Jun 2023 17:26:34 -0300 Subject: [PATCH 79/89] Setting the correct api url --- api/urls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/urls.py b/api/urls.py index 1abf78c1..06d4b6d7 100644 --- a/api/urls.py +++ b/api/urls.py @@ -1,6 +1,5 @@ from django.shortcuts import redirect from django.urls import path -from django.views.generic import 
TemplateView from drf_yasg import openapi from drf_yasg.views import get_schema_view from rest_framework import permissions @@ -23,6 +22,7 @@ license=openapi.License(name="BSD License"), ), public=True, + url='http://localhost:8000/api/', permission_classes=(permissions.AllowAny,), ) From 349b2f15c2531a16a57b95906da61dc0cbbafd8b Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Wed, 14 Jun 2023 11:52:29 -0300 Subject: [PATCH 80/89] Integrate instance exec video api with interface --- api/urls.py | 1 + api/views/crawler_instance.py | 47 +++++++++++++++++++--- main/staticfiles/js/details.js | 22 +++++++++- main/staticfiles/js/screenshot_viewer.js | 17 +++----- main/staticfiles/js/video_viewer.js | 51 ++++++++++++++++++++++++ main/templates/main/detail_crawler.html | 24 ++++++++++- 6 files changed, 143 insertions(+), 19 deletions(-) create mode 100644 main/staticfiles/js/video_viewer.js diff --git a/api/urls.py b/api/urls.py index 06d4b6d7..beeb922a 100644 --- a/api/urls.py +++ b/api/urls.py @@ -63,6 +63,7 @@ # instance debug path('instance//debug/trace', views.CrawlerInstanceViewSet.as_view({'get': 'export_trace'}), name='instance-debug-trace'), + path('instance//debug/video', views.CrawlerInstanceViewSet.as_view({'get': 'export_video'}), name='instance-debug-video'), path('instance//debug/screenshots', views.CrawlerInstanceViewSet.as_view({'get': 'screenshots'}), name='instance-debug-screenshots'), # task info diff --git a/api/views/crawler_instance.py b/api/views/crawler_instance.py index 3a84a903..2cb7e8f2 100644 --- a/api/views/crawler_instance.py +++ b/api/views/crawler_instance.py @@ -556,7 +556,7 @@ def screenshots(self, request, pk): instance = CrawlerInstance.objects.get(pk=pk) # get_object_or_404(CrawlerInstance, pk=instance_id) except: - return Response({'error': 'Instance not found.', + return Response({'error': 'Instância não encontrada.', 'total_screenshots': 0}, status=status.HTTP_404_NOT_FOUND) @@ -571,7 +571,7 @@ def screenshots(self, request, pk): if not os.path.isdir(screenshot_dir): return Response({ - 'error': 'Path of screenshots not found.', + 'error': 'O diretório de screenshots não foi encontrado.', 'total_screenshots': 0 }, status=status.HTTP_404_NOT_FOUND) @@ -580,7 +580,7 @@ def screenshots(self, request, pk): if total_screenshots == 0: return Response({ - 'error': 'None screenshots found.', + 'error': 'Nenhum screenshot encontrado.', 'total_screenshots': 0 }, status=status.HTTP_404_NOT_FOUND) @@ -621,8 +621,9 @@ def export_trace(self, request, pk): instance = CrawlerInstance.objects.get(pk=pk) # get_object_or_404(CrawlerInstance, pk=instance_id) except: - return Response(status=status.HTTP_404_NOT_FOUND) - + return Response({'error': 'Instância não encontrada.'}, + status=status.HTTP_404_NOT_FOUND) + data_path = instance.crawler.data_path file_name = f'{pk}.zip' @@ -633,11 +634,45 @@ def export_trace(self, request, pk): response = FileResponse(open(path, 'rb'), content_type='zip') except FileNotFoundError: - return Response({'error': 'Verifique se a opção de gerar arquivo trace foi habilitada na configuração do coletor'}, + return Response({'error': 'Verifique se a opção de gerar arquivo trace foi habilitada na configuração do coletor.'}, status=status.HTTP_404_NOT_FOUND) else: response['Content-Length'] = os.path.getsize(path) response['Content-Disposition'] = 'attachment; filename=%s' % file_name + return response + + @action(detail=True, methods=['get']) + def export_video(self, request, pk): + try: + instance = CrawlerInstance.objects.get(pk=pk) # 
get_object_or_404(CrawlerInstance, pk=instance_id) + + except: + return Response({'error': 'Instância não encontrada.'}, + status=status.HTTP_404_NOT_FOUND) + + data_path = instance.crawler.data_path + + video_path = os.path.join(OUTPUT_FOLDER, data_path, str(pk), 'debug', 'video') + + try: + files = [f for f in os.listdir(video_path) if f.endswith('.webm')] + if len(files) == 0: + raise FileNotFoundError + + except FileNotFoundError: + return Response({'error': 'Verifique se a opção de gerar vídeo foi habilitada na configuração do coletor.'}, + status=status.HTTP_404_NOT_FOUND) + + file_name = files[0] + file_path = os.path.join(video_path, file_name) + + response = FileResponse(open(file_path, 'rb'), + content_type='video/webm', + status=status.HTTP_200_OK) + + response['Content-Length'] = os.path.getsize(file_path) + response['Content-Disposition'] = 'attachment; filename=%s' % file_name + return response \ No newline at end of file diff --git a/main/staticfiles/js/details.js b/main/staticfiles/js/details.js index 0f1a7f0a..c227d6a6 100644 --- a/main/staticfiles/js/details.js +++ b/main/staticfiles/js/details.js @@ -186,7 +186,27 @@ function exit_crawler_queue(queue_item_id) { }); } +function downloadInstanceTrace(instance_id) { + let server_address = window.location.origin; + let url = `${server_address}/api/instance/${instance_id}/debug/trace`; + + // sends a head request to check if the file exists + $.ajax({ + url: url, + type: 'head', + dataType: 'json', + async: false, + success: function (data) { + window.open(url, '_blank'); + }, + error: function (data) { + alert('O arquivo de trace não existe!'); + } + }); +} + // Initiates all popovers on the page $(function () { $('[data-toggle="popover"]').popover() -}) \ No newline at end of file +}) + diff --git a/main/staticfiles/js/screenshot_viewer.js b/main/staticfiles/js/screenshot_viewer.js index 8c21ea49..c490e336 100644 --- a/main/staticfiles/js/screenshot_viewer.js +++ b/main/staticfiles/js/screenshot_viewer.js @@ -72,27 +72,22 @@ function displayScreenshotModal(instance_id) { return ''; } } - }).fail(function () { - $("#screenshot_modal .screenshot_list").text("Failed to load screenshots"); }); $("#screenshot_modal .spinner-border").hide(); - // $("#screenshot_modal").modal("show"); }, error: function (data) { data = data.responseJSON; - $("#screenshot_modal .screenshot_list").text(data['error']); + $("#screenshot_modal .screenshot_list").append(` + + `); $("#screenshot_modal .spinner-border").hide(); // $("#screenshot_modal").modal("hide"); } }); - - // $.ajax().done(function (data) { - // if ("error" in data) { - // return; - // } - // }); - } $('#screenshot_modal').on('hidden.bs.modal', function () { diff --git a/main/staticfiles/js/video_viewer.js b/main/staticfiles/js/video_viewer.js new file mode 100644 index 00000000..cda68907 --- /dev/null +++ b/main/staticfiles/js/video_viewer.js @@ -0,0 +1,51 @@ +function displayVideo(url) { + $("#video_player_modal .modal-body").append(` + + `); +} + +function displayVideoPlayerModal(instance_id) { + let server_address = window.location.origin; + let url = `${server_address}/api/instance/${instance_id}/debug/video`; + + $("#video_player_modal .modal-body").empty(); + + // sends a head request to check if the video exists + $.ajax({ + url: url, + type: 'head', + dataType: 'json', + success: function (data) { + // if the video exists, display it + displayVideo(url); + + // display the modal but do not allow the user to close it without stopping the video + 
$("#video_player_modal").modal({ + backdrop: 'static', + keyboard: false + }); + // $("#video_player_modal").modal("show"); + }, + error: function (data) { + // if the video does not exist, display an error message + $("#video_player_modal .modal-body").append(` + + `); + $("#video_player_modal").modal("show"); + } + }); +} + +function closeVideoPlayerModal() { + // stop the video from playing + if ($("#video_player").length > 0) + $("#video_player")[0].pause(); + + $("#video_player_modal .modal-body").empty(); + $("#video_player_modal").modal("hide"); +} \ No newline at end of file diff --git a/main/templates/main/detail_crawler.html b/main/templates/main/detail_crawler.html index c13296ba..511fe5ea 100644 --- a/main/templates/main/detail_crawler.html +++ b/main/templates/main/detail_crawler.html @@ -101,6 +101,7 @@

    Instances:

+                   Video
                    {% endif %}

    Instances:

                    Baixar
                    {% if crawler.dynamic_processing %}
-                   Baixar
+
+
                    {% endif %}
    + + + {% endif %} {% endblock %} @@ -349,5 +370,6 @@ + {% endblock %} From 2f4ae8c1f0a3d28571eccfa461ac91ff0a209f52 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Thu, 15 Jun 2023 11:26:13 -0300 Subject: [PATCH 81/89] Improves the crawler api documentation --- api/schemes/crawl_request.py | 0 api/views/crawler.py | 199 +++++++++++++++++++++++++---------- crawler_manager/settings.py | 6 +- 3 files changed, 145 insertions(+), 60 deletions(-) create mode 100644 api/schemes/crawl_request.py diff --git a/api/schemes/crawl_request.py b/api/schemes/crawl_request.py new file mode 100644 index 00000000..e69de29b diff --git a/api/views/crawler.py b/api/views/crawler.py index 04560bdb..924eb37a 100644 --- a/api/views/crawler.py +++ b/api/views/crawler.py @@ -12,7 +12,39 @@ from drf_yasg.utils import swagger_auto_schema from drf_yasg import openapi - + +# def get_model_schema(model): +# properties = {} +# fields = model._meta.fields + +# for field in fields: +# field_type = get_field_type(field) +# properties[field.name] = openapi.Schema(type=field_type) + +# return openapi.Schema( +# type=openapi.TYPE_OBJECT, +# properties=properties, +# ) + +# def get_field_type(field): +# if isinstance(field, models.CharField) or isinstance(field, models.TextField): +# return openapi.TYPE_STRING +# elif isinstance(field, models.IntegerField) or isinstance(field, models.AutoField): +# return openapi.TYPE_INTEGER +# elif isinstance(field, models.FloatField): +# return openapi.TYPE_NUMBER +# elif isinstance(field, models.BooleanField): +# return openapi.TYPE_BOOLEAN +# elif isinstance(field, models.DateField): +# return openapi.TYPE_STRING # You can customize the date format if needed +# elif isinstance(field, models.DateTimeField): +# return openapi.TYPE_STRING # You can customize the datetime format if needed +# else: +# return openapi.TYPE_STRING # Default to string if the field type is not recognized + + +# print(get_model_schema(CrawlRequest)) + class CrawlerViewSet(viewsets.ModelViewSet): """ ViewSet that allows crawlers to be viewed, edited, updated and removed. @@ -32,27 +64,6 @@ def _create_templated_url_response_handlers(self, response_handlers, crawler_id) handler['injection_type'] = 'templated_url' ResponseHandler.objects.create(**handler) - @swagger_auto_schema( - operation_summary='Retorna a lista de coletores.', - operation_description='Ao chamar por esse endpoint, uma lista de coletores será retornada em formato JSON.', - responses={ - 200: openapi.Response( - description='Lista de coletores.', - schema=openapi.Schema( - type=openapi.TYPE_ARRAY, - items=openapi.Schema( - type=openapi.TYPE_OBJECT, - properties={ - 'id': openapi.Schema( - type=openapi.TYPE_INTEGER, - description='ID único do coletor.' - ) - } - ) - ) - ) - } - ) def list(self, request, *args, **kwargs): return super().list(request, *args, **kwargs) @@ -60,14 +71,14 @@ def list(self, request, *args, **kwargs): operation_summary='Retorna um coletor.', operation_description='Ao chamar por esse endpoint, um coletor será retornado em formato JSON.', responses={ - 200: openapi.Response( - description='Coletor.', + 404: openapi.Response( + description='Coletor não encontrado.', schema=openapi.Schema( type=openapi.TYPE_OBJECT, properties={ - 'id': openapi.Schema( - type=openapi.TYPE_INTEGER, - description='ID único do coletor.' + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' 
) } ) @@ -89,22 +100,7 @@ def retrieve(self, request, *args, **kwargs): @swagger_auto_schema( operation_summary='Atualiza um coletor.', operation_description='Ao chamar por esse endpoint, um coletor será atualizado e retornado em formato JSON.', - request_body=openapi.Schema( - type=openapi.TYPE_OBJECT, - ), responses={ - 200: openapi.Response( - description='Coletor atualizado com sucesso.', - schema=openapi.Schema( - type=openapi.TYPE_OBJECT, - properties={ - 'id': openapi.Schema( - type=openapi.TYPE_INTEGER, - description='ID único do coletor.' - ) - } - ) - ), 400: openapi.Response( description='Erro na requisição.', schema=openapi.Schema( @@ -134,11 +130,6 @@ def update(self, request, *args, **kwargs): @swagger_auto_schema( operation_summary='Remove um coletor.', operation_description='Ao chamar por esse endpoint, um coletor será removido.', - responses={ - 204: openapi.Response( - description='Coletor removido com sucesso.' - ) - }, manual_parameters=[ openapi.Parameter( name='id', @@ -155,9 +146,6 @@ def destroy(self, request, *args, **kwargs): @swagger_auto_schema( operation_summary='Cria um novo coletor.', operation_description='Ao chamar por esse endpoint, um novo coletor será criado e retornado em formato JSON.', - request_body=openapi.Schema( - type=openapi.TYPE_OBJECT, - ), responses={ 201: openapi.Response( description='Coletor criado com sucesso.', @@ -214,6 +202,37 @@ def create(self, request, *args, **kwargs): @swagger_auto_schema( operation_summary='Executa o coletor.', operation_description='Ao chamar por esse endpoint, o coletor irá para a fila de coletas. A fila em que aguardará depende de seu parâmetro `expected_runtime_category`.', + responses={ + 200: openapi.Response( + description='Coletor foi colocado para execução.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'instance_id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID único da instância do coletor, retornado apenas se o parâmetro `action` for `run_immediately`.' + \ + 'Caso contrário, o coletor será colocado na fila de execução e o ID da instância será null.' + ), + 'message': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem informando em qual posição da fila o coletor foi adicionado ou se executou imediatamente.' + ) + } + ) + ), + 404: openapi.Response( + description='Coletor não encontrado.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' + ) + } + ) + ) + }, manual_parameters=[ openapi.Parameter( 'id', @@ -243,7 +262,9 @@ def run(self, request, pk): add_crawl_request(pk, wait_on) instance = process_run_crawl(pk) - return Response({'instance_id': instance.instance_id}, status=status.HTTP_201_CREATED) + return Response({'instance_id': instance.instance_id, + 'message': 'Crawler foi colocado para execução sem espera na fila de coletas.'}, + status=status.HTTP_200_OK) elif action == 'wait_on_first_queue_position': wait_on = 'first_position' @@ -263,12 +284,12 @@ def run(self, request, pk): return Response({'error': str(e)}, status=status.HTTP_400_BAD_REQUEST) if wait_on == 'first_position': - message = f'Crawler added to crawler queue in first position' + message = 'Crawler adicionado a fila de coletas na primeira posição' else: - message = f'Crawler added to crawler queue in last position' + message = 'Crawler adicionado a fila de coletas na última posição.' 
- return Response({'message': message}, status=status.HTTP_200_OK) + return Response({'message': message, 'instance_id': None}, status=status.HTTP_200_OK) @swagger_auto_schema( operation_summary='Interrompe o coletor.', @@ -280,7 +301,24 @@ def run(self, request, pk): description='ID único do crawler.', type=openapi.TYPE_INTEGER ), - ] + ], + responses={ + 204: openapi.Response( + description='O processo de interrupção do coletor foi iniciado.' + ), + 404: openapi.Response( + description='Coletor não encontrado.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' + ) + } + ) + ) + } ) @action(detail=True, methods=['get']) def stop(self, request, pk): @@ -294,7 +332,7 @@ def stop(self, request, pk): @swagger_auto_schema( operation_summary='Cria agrupamentos baseado em determinado coletor.', - operation_description='Retorna um grupo é de coletores dinâmicos que possuem os mesmos passos que o coletor de `id` passado como parâmetro.', + operation_description='Retorna um grupo de coletores dinâmicos que possuem os mesmos passos que o coletor de `id` passado como parâmetro.', manual_parameters=[ openapi.Parameter( 'id', @@ -302,10 +340,57 @@ def stop(self, request, pk): description='ID único do crawler.', type=openapi.TYPE_INTEGER ), - ] + ], + responses={ + 200: openapi.Response( + description='Lista de coletores.', + schema=openapi.Schema( + type=openapi.TYPE_ARRAY, + items=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID único do coletor.' + ), + 'source_name': openapi.Schema( + type=openapi.TYPE_STRING, + description='Nome do coletor.' + ), + 'last_modified': openapi.Schema( + type=openapi.TYPE_STRING, + description='Data da última modificação do coletor.' + ), + 'base_url': openapi.Schema( + type=openapi.TYPE_STRING, + description='URL base do coletor.' + ), + } + ) + ) + ), + 404: openapi.Response( + description='Coletor não encontrado.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' 
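# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: the shape of a successful reply
# from the group endpoint, following the response schema documented above.
# All values below are invented for the example.
example_group_response = [
    {
        'id': 7,
        'source_name': 'Diário Oficial (exemplo)',
        'last_modified': '2023-06-15T11:26:13-03:00',
        'base_url': 'https://example.com/diario',
    },
]
print(len(example_group_response), 'coletores no grupo')
# ---------------------------------------------------------------------------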
+ ) + } + ) + ) + } ) @action(detail=True, methods=['get']) def group(self, request, pk): + try: + CrawlRequest.objects.get(pk=pk) + + except CrawlRequest.DoesNotExist: + return Response({'error': 'Coletor não encontrado.'}, status=status.HTTP_404_NOT_FOUND) + crawlers = CrawlRequest.objects.raw( "select id, source_name \ from main_crawlrequest \ diff --git a/crawler_manager/settings.py b/crawler_manager/settings.py index 6a07b7a3..d25071ee 100644 --- a/crawler_manager/settings.py +++ b/crawler_manager/settings.py @@ -3,7 +3,7 @@ # Kafka host information KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' KAFKA_CONSUMER_TIMEOUT = 120000 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000 @@ -16,7 +16,7 @@ KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000 # Redis host information -REDIS_HOST = os.getenv('REDIS_HOST', 'redis') +REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') REDIS_PORT = int(os.getenv('REDIS_PORT', 6379)) REDIS_DB = int(os.getenv('REDIS_DB', 0)) REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None) @@ -39,7 +39,7 @@ WRITER_TOPIC = os.getenv('WRITER_TOPIC', KAFKA_TOPIC_PREFIX + '.writer') STOPPED_SPIDER_NOTIFICATION_ADDRESS = os.getenv( - 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://web:8000/detail/stop_crawl/{crawler_id}') + 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://localhost:8000/detail/stop_crawl/{crawler_id}') TASK_TOPIC = os.getenv('TASK_TOPIC', KAFKA_TOPIC_PREFIX + 'task_topic') TASK_DATA_CONSUMER_GROUP = os.getenv('TASK_DATA_CONSUMER_DATA', KAFKA_TOPIC_PREFIX + '.task_data_group') From 766b66d656ea6743d50ca58c73bd660511b07a94 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Thu, 15 Jun 2023 12:09:16 -0300 Subject: [PATCH 82/89] Improves crawler queue documentation --- api/urls.py | 2 +- api/views/crawler_queue.py | 222 +++++++++++++++++++++++++------------ 2 files changed, 152 insertions(+), 72 deletions(-) diff --git a/api/urls.py b/api/urls.py index beeb922a..57a85858 100644 --- a/api/urls.py +++ b/api/urls.py @@ -73,7 +73,7 @@ # queue info path('queue/', views.CrawlerQueueViewSet.as_view({'get': 'retrieve', 'put': 'update'}), name='queue'), - path('queue/switch_position//', views.CrawlerQueueViewSet.as_view({'get': 'switch_position'}), name='queue-switch-position'), + path('queue/switch_position//', views.CrawlerQueueViewSet.as_view({'get': 'switch_position'}), name='queue-switch-position'), path('queue/force_execution/', views.CrawlerQueueViewSet.as_view({'get': 'force_execution'}), name='queue-force-execution'), path('queue/remove_item/', views.CrawlerQueueViewSet.as_view({'get': 'remove_item'}), name='queue-remove-item'), diff --git a/api/views/crawler_queue.py b/api/views/crawler_queue.py index 1cd5dca1..2c9f3ba9 100644 --- a/api/views/crawler_queue.py +++ b/api/views/crawler_queue.py @@ -18,27 +18,73 @@ class CrawlerQueueViewSet(viewsets.ModelViewSet): http_method_names = ['get', 'put'] @swagger_auto_schema( - operation_summary='Retorna a fila de execução', - operation_description='Retorna a fila de execução', + operation_summary='Retorna a fila de execução.', + operation_description='Retorna os itens da fila de execução, incluindo o tamanho máximo de cada uma das 3.', responses={ 200: openapi.Response( - description='Fila de execução retornada com sucesso', - examples={ - 'application/json': { - 'success': True, - 'data': { - 'queue': 
[ - { - 'id': 1, - 'url': 'http://www.google.com', - 'status': 'pending', - 'created_at': '2020-01-01T00:00:00Z', - 'updated_at': '2020-01-01T00:00:00Z' + description='Retorna os itens da fila de execução e o número máximo de itens executando simultaneamente nelas.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'max_fast_runtime_crawlers_running': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='Número máximo de crawlers rápidos executando simultaneamente.' + ), + 'max_slow_runtime_crawlers_running': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='Número máximo de crawlers lentos executando simultaneamente.' + ), + 'max_medium_runtime_crawlers_running': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='Número máximo de crawlers de temmpo de execução médio executando simultaneamente.' + ), + 'items': openapi.Schema( + type=openapi.TYPE_ARRAY, + items=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID do item da fila de execução.' + ), + 'creation_date': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='Timestamp da data de criação do item da fila de execução.' + ), + 'last_modified': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='Timestamp da última modificação do item da fila de execução.' + ), + 'crawler_id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID do crawler associado ao item da fila de execução.' + ), + 'crawler_name': openapi.Schema( + type=openapi.TYPE_STRING, + description='Nome do crawler associado ao item da fila de execução.' + ), + 'queue_type': openapi.Schema( + type=openapi.TYPE_STRING, + description='Tipo da fila de execução do item.', + enum=['fast', 'medium', 'slow'] + ), + 'position': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='Posição do item em sua respectiva fila de execução.' + ), + 'forced_execution': openapi.Schema( + type=openapi.TYPE_BOOLEAN, + description='Indica se o item foi executado imediatamente.' + ), + 'running': openapi.Schema( + type=openapi.TYPE_BOOLEAN, + description='Indica se o item está sendo executado no momento.' + ), } - ] - } - } - } + ) + ) + } + ) ) } ) @@ -51,15 +97,15 @@ def retrieve(self, request): operation_description='Troca a posição do item A com o item B na fila de execução', manual_parameters=[ openapi.Parameter( - name='a', - in_=openapi.IN_QUERY, + name='item_a', + in_=openapi.IN_PATH, description='ID do item A', required=True, type=openapi.TYPE_INTEGER ), openapi.Parameter( - name='b', - in_=openapi.IN_QUERY, + name='item_b', + in_=openapi.IN_PATH, description='ID do item B', required=True, type=openapi.TYPE_INTEGER @@ -67,45 +113,48 @@ def retrieve(self, request): ], responses={ 200: openapi.Response( - description='Posições trocadas com sucesso', - examples={ - 'application/json': { - 'success': True - } - } + description='Posições dos itens trocadas com sucesso.', ), 400: openapi.Response( - description='Os itens devem estar na mesma fila', - examples={ - 'application/json': { - 'error': 'Crawler queue items must be in same queue!' + description='Os itens devem estar na mesma fila.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' + ) } - } + ), ), 404: openapi.Response( - description='Item A ou B não encontrado', - examples={ - 'application/json': { - 'error': 'Crawler queue item 1 not found!' 
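# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: calling switch_position with the
# two queue-item ids as path parameters, matching the api/urls.py change in
# this patch series. Base URL and item ids are assumptions for the example.
import requests

resp = requests.get('http://localhost:8000/api/queue/switch_position/10/12')
# 200 -> positions swapped; 400 -> items are in different queues; 404 -> missing item
print(resp.status_code)
# ---------------------------------------------------------------------------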
+ description='Item A e/ou B não encontrado.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' + ) } - } + ) ) } ) @action(detail=False, methods=['get']) - def switch_position(self, request, a: int, b: int): + def switch_position(self, request, item_a: int, item_b: int): with transaction.atomic(): try: - queue_item_a = CrawlerQueueItem.objects.get(pk=a) + queue_item_a = CrawlerQueueItem.objects.get(pk=item_a) except ObjectDoesNotExist: - return Response({'error': f'Crawler queue item {a} not found!'}, status=status.HTTP_404_NOT_FOUND) + return Response({'error': f'Crawler queue item {item_a} not found!'}, status=status.HTTP_404_NOT_FOUND) try: - queue_item_b = CrawlerQueueItem.objects.get(pk=b) + queue_item_b = CrawlerQueueItem.objects.get(pk=item_b) except ObjectDoesNotExist: - return Response({'error': f'Crawler queue item {b} not found!'}, status=status.HTTP_404_NOT_FOUND) + return Response({'error': f'Crawler queue item {item_b} not found!'}, status=status.HTTP_404_NOT_FOUND) if queue_item_a.queue_type != queue_item_b.queue_type: return Response({'error': 'Crawler queue items must be in same queue!'}, status=status.HTTP_400_BAD_REQUEST) @@ -121,34 +170,45 @@ def switch_position(self, request, a: int, b: int): return Response(status=status.HTTP_200_OK) @swagger_auto_schema( - operation_summary='Executa um crawler imediatamente', - operation_description='Executa um crawler imediatamente, ignorando a fila de execução', + operation_summary='Executa um crawler imediatamente.', + operation_description='Executa um crawler imediatamente, ignorando a fila de execução.', manual_parameters=[ openapi.Parameter( name='item_id', - in_=openapi.IN_QUERY, - description='ID do item da fila de execução', + in_=openapi.IN_PATH, + description='ID do item da fila de execução.', required=True, type=openapi.TYPE_INTEGER ) ], responses={ 200: openapi.Response( - description='Crawler executado com sucesso', - examples={ - 'application/json': { - 'crawler_id': 1, - 'instance_id': 1 + description='Crawler executado com sucesso.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'crawler_id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID do crawler executado.' + ), + 'instance_id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID da instância do crawler executado.' + ) } - } + ) ), 404: openapi.Response( - description='Item da fila de execução não encontrado', - examples={ - 'application/json': { - 'error': 'Crawler queue item 1 not found!' 
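# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: the success payload documented
# above for force_execution. The ids are invented for the example.
example_force_execution_response = {'crawler_id': 3, 'instance_id': 1687272000}
print(example_force_execution_response)
# ---------------------------------------------------------------------------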
+ description='Item da fila de execução não encontrado.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro' + ) } - } + ), ) } ) @@ -159,7 +219,7 @@ def force_execution(self, request, item_id: int): queue_item = CrawlerQueueItem.objects.get(pk=item_id) except ObjectDoesNotExist: - return Response({'error': f'Crawler queue item {item_id} not found!'}, status=status.HTTP_404_NOT_FOUND) + return Response({'error': f'Item não existe na fila de coletas!'}, status=status.HTTP_404_NOT_FOUND) crawler_id = queue_item.crawl_request.id @@ -182,7 +242,7 @@ def force_execution(self, request, item_id: int): manual_parameters=[ openapi.Parameter( name='item_id', - in_=openapi.IN_QUERY, + in_=openapi.IN_PATH, description='ID do item da fila de execução', required=True, type=openapi.TYPE_INTEGER @@ -190,15 +250,19 @@ def force_execution(self, request, item_id: int): ], responses={ 204: openapi.Response( - description='Item removido com sucesso' + description='Item removido com sucesso.' ), 404: openapi.Response( - description='Item da fila de execução não encontrado', - examples={ - 'application/json': { - 'error': 'Crawler queue item 1 not found!' + description='Item da fila de execução não encontrado.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro' + ) } - } + ), ) } ) @@ -209,7 +273,7 @@ def remove_item(self, request, item_id: int): queue_item.delete() except ObjectDoesNotExist: - return Response({'error': f'Crawler queue item {item_id} not found!'}, status=status.HTTP_404_NOT_FOUND) + return Response({'error': f'Ttem {item_id} não está na fila!'}, status=status.HTTP_404_NOT_FOUND) return Response(status=status.HTTP_204_NO_CONTENT) @@ -244,13 +308,29 @@ def get_object(self): ), responses={ 200: openapi.Response( - description='Configurações atualizadas com sucesso' + description='Configurações atualizadas com sucesso.' 
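# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: adjusting one of the queue size
# limits through the PUT endpoint documented above. The field name comes from
# the retrieve schema earlier in this file; the base URL and the new limit are
# assumptions for the example.
import requests

resp = requests.put('http://localhost:8000/api/queue/',
                    data={'max_fast_runtime_crawlers_running': 4})
print(resp.status_code)   # 200 on success, 400 when the payload is rejected
# ---------------------------------------------------------------------------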
+ ), + 400: openapi.Response( + description='Erro ao atualizar as configurações', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro' + ) + } + ) ) } ) def update(self, request): - response = super().update(request) - + try: + super().update(request) + + except Exception as e: + return Response({'error': e.message}, status=status.HTTP_400_BAD_REQUEST) + # updade crawler queue instance with new configs global CRAWLER_QUEUE CRAWLER_QUEUE = CrawlerQueue.object() @@ -270,4 +350,4 @@ def update(self, request): if 'max_slow_runtime_crawlers_running' in request.data: unqueue_crawl_requests('slow') - return response \ No newline at end of file + return Response(status=status.HTTP_200_OK) \ No newline at end of file From f914458a871272b574466de909e31b0c8d233fff Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 20 Jun 2023 12:02:08 -0300 Subject: [PATCH 83/89] Fix integration errors after api updates --- api/views/crawler_queue.py | 29 ++++++------------ main/models.py | 27 ++++++++++++----- main/staticfiles/js/crawler_queue.js | 30 ++++++++----------- main/staticfiles/js/details.js | 19 ++++++++++++ main/templates/main/detail_crawler.html | 2 +- main/utils.py | 25 ++++++++++++---- .../crawling_utils/crawling_utils.py | 15 ++++------ 7 files changed, 87 insertions(+), 60 deletions(-) diff --git a/api/views/crawler_queue.py b/api/views/crawler_queue.py index 2c9f3ba9..39cb059a 100644 --- a/api/views/crawler_queue.py +++ b/api/views/crawler_queue.py @@ -329,25 +329,14 @@ def update(self, request): super().update(request) except Exception as e: - return Response({'error': e.message}, status=status.HTTP_400_BAD_REQUEST) + return Response({'error': request.data}, status=status.HTTP_400_BAD_REQUEST) - # updade crawler queue instance with new configs - global CRAWLER_QUEUE - CRAWLER_QUEUE = CrawlerQueue.object() - - # the size of queue of type fast changed, may is possible run - # more crawlers - if 'max_fast_runtime_crawlers_running' in request.data: - unqueue_crawl_requests('fast') - - # the size of queue of type normal changed, may is possible run - # more crawlers - if 'max_medium_runtime_crawlers_running' in request.data: - unqueue_crawl_requests('medium') - - # the size of queue of type slow changed, may is possible run - # more crawlers - if 'max_slow_runtime_crawlers_running' in request.data: - unqueue_crawl_requests('slow') + try: + unqueue_crawl_requests('fast', True) + unqueue_crawl_requests('medium', True) + unqueue_crawl_requests('slow', True) - return Response(status=status.HTTP_200_OK) \ No newline at end of file + return Response(status=status.HTTP_200_OK) + + except Exception as e: + return Response({'error': e.message}, status=status.HTTP_500_INTERNAL_SERVER_ERROR) \ No newline at end of file diff --git a/main/models.py b/main/models.py index f5bce38e..1525238b 100644 --- a/main/models.py +++ b/main/models.py @@ -704,12 +704,13 @@ class TaskType(TypedDict): repeat_mode: Literal['no_repeat', 'daily', 'weekly', 'monthly', 'yearly', 'personalized'] personalized_repetition_mode: Union[None, PersonalizedRepetionMode] - class Task(TimeStamped): - crawl_request = models.ForeignKey(CrawlRequest, on_delete=models.CASCADE, related_name='scheduler_jobs') + crawl_request = models.ForeignKey(CrawlRequest, on_delete=models.CASCADE, related_name='scheduler_jobs', + help_text='Coletor que será agendado para execução.') # data e horário base para começar o agendamento de coletas - runtime = 
models.DateTimeField() + runtime = models.DateTimeField(help_text='Data e horário base para começar o agendamento de coletas.' + \ + 'Após o primeiro agendamento, o próximo será calculado de acordo com o intervalo de repetição e o horário definido nesse atributo.') CRAWLER_QUEUE_BEHAVIOR_CHOICES = [ ('wait_on_last_queue_position', 'Esperar na última posição da fila'), @@ -719,8 +720,11 @@ class Task(TimeStamped): ] # O que o agendador deve fazer com o coletor ao inserí-lo na fila de coletas. - crawler_queue_behavior = models.CharField( - max_length=32, choices=CRAWLER_QUEUE_BEHAVIOR_CHOICES, default='wait_on_last_queue_position') + crawler_queue_behavior = models.CharField(max_length=32, choices=CRAWLER_QUEUE_BEHAVIOR_CHOICES, + help_text='Define o que o agendador deve fazer com o coletor ao inserí-lo na fila de coletas, se irá executar' +\ + ' imediatamente (`run_immediately`), esperar na primeira (`wait_on_first_queue_position`) ou última posição ' +\ + '(`wait_on_last_queue_position`) de sua fila de coletas.', + default='wait_on_last_queue_position') REPETITION_MODE_CHOICES = [ ('no_repeat', 'Não se repete'), @@ -732,7 +736,16 @@ class Task(TimeStamped): ] # modo de repetição da coleta agendada. - repeat_mode = models.CharField(max_length=32, choices=REPETITION_MODE_CHOICES, default='no_repeat') + repeat_mode = models.CharField(max_length=32, choices=REPETITION_MODE_CHOICES, default='no_repeat', + help_text=''' + Define o tipo de repetição da coleta agendada. Pode ser: + - `no_repeat`: Não se repete. + - `daily`: Diariamente, na hora definida em `runtime`. + - `weekly`: Semanalmente, na mesma hora e dia da semana de sua primeira execução, definida em `runtime`. + - `monthly`: Mensalmente, na mesma hora e dia do mês de sua primeira execução, definida em `runtime`. Caso o mês não tenha o dia definido em `runtime`, a coleta ocorrerá no último dia do mês. + - `yearly`: Anualmente, na mesma hora e dia do ano de sua primeira execução, definida em `runtime`. Caso o ano não tenha o dia definido em `runtime`, a coleta ocorrerá no último dia do respectivo mês. + - `personalized`: Personalizado, de acordo com a configuração definida em `personalized_repetition_mode`. 
+ ''') # json com a configuração personalizada de reexecução do coletor - personalized_repetition_mode: PersonalizedRepetionMode = models.JSONField(null=True, blank=True) + personalized_repetition_mode: PersonalizedRepetionMode = models.JSONField(null=True, blank=True,) \ No newline at end of file diff --git a/main/staticfiles/js/crawler_queue.js b/main/staticfiles/js/crawler_queue.js index 4aba50e5..8d1f70ff 100644 --- a/main/staticfiles/js/crawler_queue.js +++ b/main/staticfiles/js/crawler_queue.js @@ -1,7 +1,7 @@ var SERVER_ADDRESS = window.location.origin; // the queue always has id = 1 as it is unique and designed that way -var CRAWLER_QUEUE_API_ADDRESS = SERVER_ADDRESS + '/api/crawler_queue/1/'; +var CRAWLER_QUEUE_API_ADDRESS = SERVER_ADDRESS + '/api/queue/'; // when this variable equals true, it blocks the interface update to prevent // the data being changed from being rewritten by the interface update @@ -321,6 +321,8 @@ function update_scheduler_config(data) { } function update_ui() { + console.log('Atualizando Interface...'); + if (UPDATING_SCHEDULER_CONFIG) return; @@ -372,15 +374,11 @@ function updateMaxCrawlers() { $.ajax({ url: CRAWLER_QUEUE_API_ADDRESS, type: 'put', - contentType: "application/json; charset=utf-8", - dataType: "json", async: false, data: data, success: function (data) { UPDATING_SCHEDULER_CONFIG = false; - }, - error: function (data) { - alert('Houve um erro ao editar o campo!'); + console.log('Configurações atualizadas com sucesso!', data); } }); } @@ -429,25 +427,23 @@ function filter_waiting_crawlers(filter) { function switch_position(a, b) { UPDATING_SCHEDULER_CONFIG = true; - let switch_position_address = CRAWLER_QUEUE_API_ADDRESS + `switch_position/?a=${a}&b=${b}`; + let switch_position_address = CRAWLER_QUEUE_API_ADDRESS + `switch_position/${a}/${b}`; $.ajax({ url: switch_position_address, type: 'get', dataType: 'json', async: false, - success: function (data) { - UPDATING_SCHEDULER_CONFIG = false; - update_ui(); - }, - error: function (data) { - alert('Houve um erro ao trocar posições na fila!'); - } }); + + console.log('switched'); + + UPDATING_SCHEDULER_CONFIG = false; + update_ui(); } function forceExecution() { - let force_exec_address = CRAWLER_QUEUE_API_ADDRESS + `force_execution?queue_item_id=${QUEUE_ITEM_TO_FORCE_EXEC}`; + let force_exec_address = CRAWLER_QUEUE_API_ADDRESS + `force_execution/${QUEUE_ITEM_TO_FORCE_EXEC}`; $.ajax({ url: force_exec_address, @@ -466,7 +462,7 @@ function forceExecution() { } function remove_item_from_queue(queue_item_id) { - let remove_queue_item_address = CRAWLER_QUEUE_API_ADDRESS + `remove_item?queue_item_id=${queue_item_id}`; + let remove_queue_item_address = CRAWLER_QUEUE_API_ADDRESS + `remove_item/${queue_item_id}`; UPDATING_SCHEDULER_CONFIG = true; $.ajax({ @@ -561,7 +557,7 @@ function get_default_active_tab() { } function stop_running_crawler(crawler_id) { - let stop_crawler_address = SERVER_ADDRESS + `/api/crawlers/${crawler_id}/stop`; + let stop_crawler_address = SERVER_ADDRESS + `/api/crawler/${crawler_id}/stop`; $.ajax({ url: stop_crawler_address, type: 'get', diff --git a/main/staticfiles/js/details.js b/main/staticfiles/js/details.js index c227d6a6..a015cdad 100644 --- a/main/staticfiles/js/details.js +++ b/main/staticfiles/js/details.js @@ -205,6 +205,25 @@ function downloadInstanceTrace(instance_id) { }); } +function downloadConfig(instance_id) { + let server_address = window.location.origin; + let url = `${server_address}/api/instance/${instance_id}/config`; + + // sends a head request to check if 
the file exists + $.ajax({ + url: url, + type: 'head', + dataType: 'json', + async: false, + success: function (data) { + window.open(url, '_blank'); + }, + error: function (data) { + alert('O arquivo de configuração não existe!'); + } + }); +} + // Initiates all popovers on the page $(function () { $('[data-toggle="popover"]').popover() diff --git a/main/templates/main/detail_crawler.html b/main/templates/main/detail_crawler.html index 511fe5ea..92aa6008 100644 --- a/main/templates/main/detail_crawler.html +++ b/main/templates/main/detail_crawler.html @@ -114,7 +114,7 @@

    Instances:

    {{instance.duration_readable}} {{instance.num_data_files}} arquivos {{instance.data_size_readable}} - Baixar + {% if crawler.dynamic_processing %} diff --git a/main/utils.py b/main/utils.py index 8504e524..53781edd 100644 --- a/main/utils.py +++ b/main/utils.py @@ -1,8 +1,13 @@ +import os +import subprocess + from typing_extensions import Literal from django.db import transaction from django.utils import timezone import crawler_manager.crawler_manager as crawler_manager +from crawler_manager.settings import OUTPUT_FOLDER + from main.models import CrawlRequest, CrawlerInstance, CrawlerQueue, CrawlerQueueItem from main.forms import ParameterHandlerFormSet, ResponseHandlerFormSet @@ -17,6 +22,10 @@ except: pass +def update_queue_instance(): + global CRAWLER_QUEUE + CRAWLER_QUEUE = CrawlerQueue.object() + def create_instance(crawler_id, instance_id): mother = CrawlRequest.objects.filter(id=crawler_id) obj = CrawlerInstance.objects.create( @@ -64,7 +73,10 @@ def add_crawl_request(crawler_id, wait_on: Literal['last_position', 'first_posit queue_type=cr_expec_runtime_cat) queue_item.save() -def unqueue_crawl_requests(queue_type: str): +def unqueue_crawl_requests(queue_type: str, update_queue: bool = False): + if update_queue: + update_queue_instance() + crawlers_runnings = list() has_items_from_another_queue, queue_items = CRAWLER_QUEUE.get_next(queue_type) @@ -108,15 +120,16 @@ def process_stop_crawl(crawler_id, from_sm_listener: bool = False): # FIXME: Colocar esse trecho de código no módulo writer # computa o tamanho em kbytes do diretório "data" - # command_output = subprocess.run(["du " + config['data_path'] + "/data -d 0"], shell=True, stdout=subprocess.PIPE) - # output_line = command_output.stdout.decode('utf-8').strip('\n') - # parts = output_line.split('\t') - data_size_kbytes = 0 # int(parts[0]) + instance_path = os.path.join(OUTPUT_FOLDER, config['data_path'], str(instance_id), 'data') + + command_output = subprocess.run(["du " + instance_path + " -d 0"], shell=True, stdout=subprocess.PIPE) + output_line = command_output.stdout.decode('utf-8').strip('\n') + parts = output_line.split('\t') + data_size_kbytes = int(parts[0]) # Get the number of files downloaded from the instance object num_data_files = instance.number_files_success_download - instance = None instance_info = {} queue_type = None diff --git a/src/crawling_utils/crawling_utils/crawling_utils.py b/src/crawling_utils/crawling_utils/crawling_utils.py index 792cdf00..9d5eb161 100644 --- a/src/crawling_utils/crawling_utils/crawling_utils.py +++ b/src/crawling_utils/crawling_utils/crawling_utils.py @@ -10,11 +10,11 @@ SERVER_ADDRESS = os.getenv('SERVER_ADDRESS', 'http://localhost:8000') -SERVER_NEW_PAGE_FOUND_API = SERVER_ADDRESS + '/download/pages/found/{instance_id}/{num_pages}' -SERVER_PAGE_CRAWLED_API = SERVER_ADDRESS + '/download/page/{message}/{instance_id}' +SERVER_NEW_PAGE_FOUND_API = SERVER_ADDRESS + '/api/instance/{instance_id}/page/found/{num_pages}' +SERVER_PAGE_CRAWLED_API = SERVER_ADDRESS + '/api/instance/{instance_id}/page/{message}' -SERVER_FILES_FOUND_API = SERVER_ADDRESS + '/download/files/found/{instance_id}/{num_files}' -SERVER_FILE_DOWNLOADED_API = SERVER_ADDRESS + '/download/file/{message}/{instance_id}' +SERVER_FILES_FOUND_API = SERVER_ADDRESS + '/api/instance/{instance_id}/file/found/{num_files}' +SERVER_FILE_DOWNLOADED_API = SERVER_ADDRESS + '/api/instance/{instance_id}/file/{message}' SERVER_SESSION = requests.sessions.Session() @@ -67,7 +67,6 @@ def notify_new_page_found(instance_id: str, num_pages: 
int = 1): instance_id=instance_id, num_pages=num_pages) notify_server(server_notification_url) - def notify_page_crawled_successfully(instance_id: str): server_notification_url = SERVER_PAGE_CRAWLED_API.format( message='success', instance_id=instance_id) @@ -75,7 +74,7 @@ def notify_page_crawled_successfully(instance_id: str): def notify_page_previously_crawled(instance_id: str): server_notification_url = SERVER_PAGE_CRAWLED_API.format( - message='previously_crawled', instance_id=instance_id) + message='previously', instance_id=instance_id) notify_server(server_notification_url) def notify_page_crawled_with_error(instance_id: str): @@ -89,13 +88,11 @@ def notify_page_duplicated_found(instance_id: str): message='duplicated', instance_id=instance_id) notify_server(server_notification_url) - def notify_files_found(instance_id: str, num_files: int): server_notification_url = SERVER_FILES_FOUND_API.format( instance_id=instance_id, num_files=num_files) notify_server(server_notification_url) - def notify_file_downloaded_successfully(instance_id: str): server_notification_url = SERVER_FILE_DOWNLOADED_API.format( message='success', instance_id=instance_id) @@ -103,7 +100,7 @@ def notify_file_downloaded_successfully(instance_id: str): def notify_file_previously_crawled(instance_id: str): server_notification_url = SERVER_FILE_DOWNLOADED_API.format( - message='previously_crawled', instance_id=instance_id) + message='previously', instance_id=instance_id) notify_server(server_notification_url) def notify_file_downloaded_with_error(instance_id: str): From ca278819d103193ebb3257a2b8ec5899750b373d Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 20 Jun 2023 12:24:10 -0300 Subject: [PATCH 84/89] Updates interface integration with the apu --- api/views/crawler.py | 32 --------------------- api/views/task.py | 31 ++++++++++++++++---- crawler_manager/settings.py | 2 +- main/staticfiles/js/crawler_list_grouped.js | 9 ++++-- 4 files changed, 33 insertions(+), 41 deletions(-) diff --git a/api/views/crawler.py b/api/views/crawler.py index 924eb37a..fd8360b8 100644 --- a/api/views/crawler.py +++ b/api/views/crawler.py @@ -13,38 +13,6 @@ from drf_yasg.utils import swagger_auto_schema from drf_yasg import openapi -# def get_model_schema(model): -# properties = {} -# fields = model._meta.fields - -# for field in fields: -# field_type = get_field_type(field) -# properties[field.name] = openapi.Schema(type=field_type) - -# return openapi.Schema( -# type=openapi.TYPE_OBJECT, -# properties=properties, -# ) - -# def get_field_type(field): -# if isinstance(field, models.CharField) or isinstance(field, models.TextField): -# return openapi.TYPE_STRING -# elif isinstance(field, models.IntegerField) or isinstance(field, models.AutoField): -# return openapi.TYPE_INTEGER -# elif isinstance(field, models.FloatField): -# return openapi.TYPE_NUMBER -# elif isinstance(field, models.BooleanField): -# return openapi.TYPE_BOOLEAN -# elif isinstance(field, models.DateField): -# return openapi.TYPE_STRING # You can customize the date format if needed -# elif isinstance(field, models.DateTimeField): -# return openapi.TYPE_STRING # You can customize the datetime format if needed -# else: -# return openapi.TYPE_STRING # Default to string if the field type is not recognized - - -# print(get_model_schema(CrawlRequest)) - class CrawlerViewSet(viewsets.ModelViewSet): """ ViewSet that allows crawlers to be viewed, edited, updated and removed. 
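# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: querying the task filter endpoint
# whose date parameters are documented in the api/views/task.py changes below
# (dd-mm-yyyy format). Base URL and dates are assumptions for the example.
import requests

resp = requests.get('http://localhost:8000/api/task/filter',
                    params={'start_date': '01-06-2023', 'end_date': '30-06-2023'})
print(resp.status_code)   # 200 with a list like [{'17-06-2023': [4, 9]}, ...]
# ---------------------------------------------------------------------------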
diff --git a/api/views/task.py b/api/views/task.py index 07315f6f..a66a1bdd 100644 --- a/api/views/task.py +++ b/api/views/task.py @@ -20,11 +20,32 @@ class TaskViewSet(viewsets.ModelViewSet): serializer_class = TaskSerializer @swagger_auto_schema( - operation_summary="Obtêm todos agendamentos de coletas.", - operation_description="Este endpoint obtêm todos agendamentos de coletas.", - responses={ - 200: 'OK' - } + operation_summary='Obtêm todos agendamentos de coletas.', + operation_description='Retorna todas as configurações de agendamentos.', + # responses={ + # 200: openapi.Response( + # description='Lista de configuração de agendamento de coletas.', + # schema=openapi.Schema( + # type=openapi.TYPE_ARRAY, + # items=openapi.Schema( + # type=openapi.TYPE_OBJECT, + # properties={ + # 'id': openapi.Schema( + # type=openapi.TYPE_INTEGER, + # description='ID único do agendamento de coleta.' + # ), + # 'crawl_request': openapi.Schema( + # type=openapi.TYPE_INTEGER, + # description='ID único da requisição de coleta.' + # ), + # 'runtime': openapi.Schema( + # type=openapi.TYPE_INTEGER, + # description='Tempo de execução do agendamento de coleta.' + # } + # ) + # ) + # ), + # } ) def list(self, request, *args, **kwargs): return super().list(request, *args, **kwargs) diff --git a/crawler_manager/settings.py b/crawler_manager/settings.py index d25071ee..2afb68d4 100644 --- a/crawler_manager/settings.py +++ b/crawler_manager/settings.py @@ -16,7 +16,7 @@ KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000 # Redis host information -REDIS_HOST = os.getenv('REDIS_HOST', 'localhost') +REDIS_HOST = os.getenv('REDIS_HOST', 'redis') REDIS_PORT = int(os.getenv('REDIS_PORT', 6379)) REDIS_DB = int(os.getenv('REDIS_DB', 0)) REDIS_PASSWORD = os.getenv('REDIS_PASSWORD', None) diff --git a/main/staticfiles/js/crawler_list_grouped.js b/main/staticfiles/js/crawler_list_grouped.js index 34ce3b9e..6ce2caef 100644 --- a/main/staticfiles/js/crawler_list_grouped.js +++ b/main/staticfiles/js/crawler_list_grouped.js @@ -16,10 +16,13 @@ $(function(){ let $group_content = $($(e.target).find(".group-content")); let $loading = $($(e.target).find(".loading")); let $content_template = $($("#group-content-template").html()); + + let url = `${window.location.origin}/api/crawler/${crawler_id}/group`; + console.log('>>', url); - let ajax_request = $.ajax("get_crawlers_from_same_group/"+crawler_id); - ajax_request.done(function(response){ - let json_response = $.parseJSON(response); + let ajax_request = $.ajax(url); + + ajax_request.done(function (json_response){ let table = $content_template.clone(); $group_content.html(table); for (let i = 0; i < json_response.length; i++) { From fef360923e3a688c65b12c760b06cc41bf67c43f Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 20 Jun 2023 15:13:02 -0300 Subject: [PATCH 85/89] Swagger docs. for tasks completed. 
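# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: a request body compatible with
# the TASK_SCHEMA_CREATE definition added below, creating a weekly personalized
# schedule. The crawl_request id, dates and base URL are assumptions; the
# weekday list follows the convention documented in the schema (0 = domingo).
import requests

new_task = {
    'crawl_request': 1,
    'runtime': '2023-07-03T09:00:00-03:00',
    'crawler_queue_behavior': 'wait_on_last_queue_position',
    'repeat_mode': 'personalized',
    'personalized_repetition_mode': {
        'type': 'weekly',
        'interval': 1,
        'additional_data': [1, 4],        # segunda e quinta
        'finish': {'type': 'never'},
    },
}
resp = requests.post('http://localhost:8000/api/task/', json=new_task)
print(resp.status_code)   # 201 Created on success
# ---------------------------------------------------------------------------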
--- api/urls.py | 2 +- api/views/task.py | 206 ++++++++++++++++++---- main/staticfiles/js/scheduler/services.js | 11 +- main/urls.py | 1 - 4 files changed, 177 insertions(+), 43 deletions(-) diff --git a/api/urls.py b/api/urls.py index 57a85858..d5f628ea 100644 --- a/api/urls.py +++ b/api/urls.py @@ -69,7 +69,7 @@ # task info path('task/', views.TaskViewSet.as_view(list_and_create_actions), name='task'), path('task/', views.TaskViewSet.as_view(retrieve_update_and_destroy_actions), name='task-detail'), - path('task//filter', views.TaskViewSet.as_view({'get': 'filter'}), name='task-filter'), + path('task/filter', views.TaskViewSet.as_view({'get': 'filter'}), name='task-filter'), # queue info path('queue/', views.CrawlerQueueViewSet.as_view({'get': 'retrieve', 'put': 'update'}), name='queue'), diff --git a/api/views/task.py b/api/views/task.py index a66a1bdd..e35cc2aa 100644 --- a/api/views/task.py +++ b/api/views/task.py @@ -1,4 +1,5 @@ import json +import copy from datetime import datetime from rest_framework import viewsets, status @@ -14,7 +15,114 @@ import crawler_manager.crawler_manager as crawler_manager from crawler_manager.settings import TASK_TOPIC - + +TASK_SCHEMA = openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'id': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID único do agendamento de coleta.' + ), + 'creation_date': openapi.Schema( + type=openapi.TYPE_STRING, + description='Data de criação do agendamento de coleta.' + ), + 'last_modified': openapi.Schema( + type=openapi.TYPE_STRING, + description='Data de atualização do agendamento de coleta.' + ), + 'crawl_request': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='ID único da requisição de coleta que será executada.' + ), + 'crawler_name': openapi.Schema( + type=openapi.TYPE_STRING, + description='Nome do crawler que será executado.' + ), + 'runtime': openapi.Schema( + type=openapi.TYPE_STRING, + description='Data e horário base para começar o agendamento de coletas.' + \ + 'Após o primeiro agendamento, o próximo será calculado de acordo com o intervalo de repetição e o horário definido nesse atributo.' + ), + 'crawler_queue_behavior': openapi.Schema( + type=openapi.TYPE_STRING, + description='Define o que o agendador deve fazer com o coletor ao inserí-lo na fila de coletas, se irá executar' +\ + ' imediatamente (`run_immediately`), esperar na primeira (`wait_on_first_queue_position`) ou última posição ' +\ + '(`wait_on_last_queue_position`) de sua fila de coletas.', + default='wait_on_last_queue_position', + enum=['wait_on_last_queue_position', 'wait_on_first_queue_position', 'run_immediately'] + ), + 'repeat_mode': openapi.Schema( + type=openapi.TYPE_STRING, + description=''' + Define o tipo de repetição da coleta agendada. Pode ser: + - `no_repeat`: Não se repete. + - `daily`: Diariamente, na hora definida em `runtime`. + - `weekly`: Semanalmente, na mesma hora e dia da semana de sua primeira execução, definida em `runtime`. + - `monthly`: Mensalmente, na mesma hora e dia do mês de sua primeira execução, definida em `runtime`. Caso o mês não tenha o dia definido em `runtime`, a coleta ocorrerá no último dia do mês. + - `yearly`: Anualmente, na mesma hora e dia do ano de sua primeira execução, definida em `runtime`. Caso o ano não tenha o dia definido em `runtime`, a coleta ocorrerá no último dia do respectivo mês. + - `personalized`: Personalizado, de acordo com a configuração definida em `personalized_repetition_mode`. 
+ ''', + default='no_repeat', + enum=['no_repeat', 'daily', 'weekly', 'monthly', 'yearly', 'personalized'] + ), + 'personalized_repetition_mode': openapi.Schema( + type=openapi.TYPE_OBJECT, + nullable=True, + description='Configuração de repetição personalizada. Deve ser definido apenas se `repeat_mode` for `personalized`.', + properties={ + 'type': openapi.Schema( + type=openapi.TYPE_STRING, + description='Tipo de repetição personalizada.', + enum=['daily', 'weekly', 'monthly', 'yearly'] + ), + 'interval': openapi.Schema( + type=openapi.TYPE_INTEGER, + description='Intervalo de repetição da coleta personalizada.' + ), + 'additional_data': openapi.Schema( + type=openapi.TYPE_OBJECT, + description='Dados adicionais para configuração da repetição personalizada.' + \ + 'Caso o tipo de repetição seja `weekly`, passe uma lista com os dias da semana' + \ + ' que o coletor deve ser executado, sendo domingo 0 e sábado 6. Exemplo: [0, 1, 2, 3, 4, 5, 6].' + \ + ' Caso o tipo de repetição seja `monthly`, passe um dicionário com os atributos `type`, que pode' + \ + ' ser `first-weekday`, `last-weekday` ou `day-x`, e `value`. Nesse último, informe ' + \ + ' o primeiro ou último dia da semana do mês que o coletor deve ser executado, ou o dia específico do mês, ' + \ + ' respectivamente. Exemplo: {"type": "first-weekday", "value": 0}, executará todo domingo do mês.', + nullable=True + ), + 'finish': openapi.Schema( + type=openapi.TYPE_OBJECT, + description='Como o agendamento do coletor deve ser finalizado.', + nullable=True, + properties={ + 'type': openapi.Schema( + type=openapi.TYPE_STRING, + description='Tipo de finalização do agendamento. Caso seja `never`, o agendamento não será finalizado.' + \ + ' Se for `occurrence`, o agendamento será interrompido após um número de ocorrências definido em `value`.' + \ + ' Se for `date`, o agendamento será interrompido após uma data definida em `value`.', + enum=['never', 'occurrence', 'date'], + default='never' + ), + 'value': openapi.Schema( + type=openapi.TYPE_STRING, + description='Valor de finalização do agendamento. Deve ser definido apenas se `type` for `occurrence` ou `date`.' + \ + ' Se for `occurrence`, informe o número de ocorrências que o agendamento deve executar antes de ser finalizado.' + \ + ' Se for `date`, informe a data em que o agendamento deve ser finalizado. O formato deve ser `YYYY-MM-DD`.' + ) + } + ) + } + ), + } +) + +TASK_SCHEMA_CREATE = copy.deepcopy(TASK_SCHEMA) + +TASK_SCHEMA_CREATE.properties.pop('id') +TASK_SCHEMA_CREATE.properties.pop('creation_date') +TASK_SCHEMA_CREATE.properties.pop('last_modified') + class TaskViewSet(viewsets.ModelViewSet): queryset = Task.objects.all() serializer_class = TaskSerializer @@ -22,30 +130,15 @@ class TaskViewSet(viewsets.ModelViewSet): @swagger_auto_schema( operation_summary='Obtêm todos agendamentos de coletas.', operation_description='Retorna todas as configurações de agendamentos.', - # responses={ - # 200: openapi.Response( - # description='Lista de configuração de agendamento de coletas.', - # schema=openapi.Schema( - # type=openapi.TYPE_ARRAY, - # items=openapi.Schema( - # type=openapi.TYPE_OBJECT, - # properties={ - # 'id': openapi.Schema( - # type=openapi.TYPE_INTEGER, - # description='ID único do agendamento de coleta.' - # ), - # 'crawl_request': openapi.Schema( - # type=openapi.TYPE_INTEGER, - # description='ID único da requisição de coleta.' - # ), - # 'runtime': openapi.Schema( - # type=openapi.TYPE_INTEGER, - # description='Tempo de execução do agendamento de coleta.' 
- # } - # ) - # ) - # ), - # } + responses={ + 200: openapi.Response( + description='Retorna todas as configurações de agendamento de coleta.', + schema=openapi.Schema( + type=openapi.TYPE_ARRAY, + items=TASK_SCHEMA + ) + ) + } ) def list(self, request, *args, **kwargs): return super().list(request, *args, **kwargs) @@ -56,15 +149,20 @@ def list(self, request, *args, **kwargs): manual_parameters=[ openapi.Parameter( name='id', - in_=openapi.IN_QUERY, + in_=openapi.IN_PATH, description='ID único do agendamento de coleta', required=True, type=openapi.TYPE_INTEGER ) ], responses={ - 200: 'OK', - 404: 'Not Found' + 200: openapi.Response( + description='Retorna a configuração do agendamento de coleta.', + schema=TASK_SCHEMA + ), + 404: openapi.Response( + description='Agendamento de coleta não encontrado.' + ) } ) def retrieve(self, request, pk=None): @@ -73,6 +171,7 @@ def retrieve(self, request, pk=None): @swagger_auto_schema( operation_summary="Cria um novo agendamento de coleta.", operation_description="Este endpoint cria um novo agendamento de coleta.", + request_body=TASK_SCHEMA_CREATE, responses={ 201: 'Created', 400: 'Bad Request' @@ -94,12 +193,13 @@ def create(self, request): manual_parameters=[ openapi.Parameter( name='id', - in_=openapi.IN_QUERY, + in_=openapi.IN_PATH, description='ID único do agendamento de coleta', required=True, type=openapi.TYPE_INTEGER ) ], + request_body=TASK_SCHEMA_CREATE, responses={ 200: 'OK', 400: 'Bad Request', @@ -122,15 +222,19 @@ def update(self, request, pk=None): manual_parameters=[ openapi.Parameter( name='id', - in_=openapi.IN_QUERY, + in_=openapi.IN_PATH, description='ID único do agendamento de coleta', required=True, type=openapi.TYPE_INTEGER ) ], responses={ - 204: 'No Content', - 404: 'Not Found' + 204: openapi.Response( + description='Agendamento de coleta removido com sucesso.' + ), + 404: openapi.Response( + description='Agendamento de coleta não encontrado.' + ) } ) def destroy(self, request, pk=None): @@ -163,21 +267,51 @@ def __str2date(self, s: str) -> datetime: openapi.Parameter( name='start_date', in_=openapi.IN_QUERY, - description='Data de início do intervalo', + description='Data de início do intervalo, no formato dd-mm-yyyy.', required=True, type=openapi.TYPE_STRING ), openapi.Parameter( name='end_date', in_=openapi.IN_QUERY, - description='Data de fim do intervalo', + description='Data de fim do intervalo, no formato dd-mm-yyyy.', required=True, type=openapi.TYPE_STRING ) ], responses={ - 200: 'OK', - 400: 'Bad Request' + 200: openapi.Response( + description='Retorna todos os agendamentos de coleta no intervalo especificado.', + schema=openapi.Schema( + type=openapi.TYPE_ARRAY, + description='Lista de agendamentos de coleta por data. Cada item da lista é um objeto com chave sendo dd-mm-yyyy ' + \ + 'e valor sendo uma lista de IDS de agendamentos de coleta para a data especificada.', + items=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'dd-mm-yyyy': openapi.Schema( + type=openapi.TYPE_ARRAY, + description='Lista de IDs de agendamentos para a data dd-mm-yyyy.', + items=openapi.Schema( + type=openapi.TYPE_INTEGER + ) + ), + } + ) + ) + ), + 400: openapi.Response( + description='As datas devem estar no formato dd-mm-yyyy e serem válidas.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'message': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' 
+ ) + } + ) + ) } ) @action(detail=False) diff --git a/main/staticfiles/js/scheduler/services.js b/main/staticfiles/js/scheduler/services.js index de450742..c98b2acc 100644 --- a/main/staticfiles/js/scheduler/services.js +++ b/main/staticfiles/js/scheduler/services.js @@ -1,4 +1,5 @@ var services = {}; +var server_address = window.location.origin; services.sleep = function (ms) { var now = new Date().getTime(); @@ -9,7 +10,7 @@ services.save_new_scheduling = function (new_scheduling_config) { let parsed_data = JSON.stringify(new_scheduling_config); $.ajax({ - url: '/api/tasks/', + url: `${server_address}/api/task/`, type: 'post', contentType: "application/json; charset=utf-8", dataType: "json", @@ -28,7 +29,7 @@ services.save_new_scheduling = function (new_scheduling_config) { services.get_tasks_in_interval = function (start_date, end_date) { $.ajax({ - url: `/api/tasks/filter?start_date=${start_date}&end_date=${end_date}`, + url: `${server_address}/api/task/filter?start_date=${start_date}&end_date=${end_date}`, type: 'get', async: false, success: function (data) { @@ -46,7 +47,7 @@ services.get_tasks_in_interval = function (start_date, end_date) { services.get_task = function (task_id) { let task; $.ajax({ - url: `/api/tasks/${task_id}`, + url: `${server_address}/api/task/${task_id}`, type: 'get', async: false, success: function (data) { @@ -72,7 +73,7 @@ services.update_tasks = function (tarks_ids) { services.delete_task = function(task_id) { $.ajax({ - url: `/api/tasks/${task_id}`, + url: `${server_address}/api/task/${task_id}`, type: 'delete', async: false, success: function (data) { @@ -90,7 +91,7 @@ services.save_updated_scheduling = function (task_being_edited) { let parsed_data = JSON.stringify(task_being_edited); $.ajax({ - url: `/api/tasks/${task_id}/`, + url: `${server_address}/api/task/${task_id}/`, type: 'put', contentType: "application/json; charset=utf-8", dataType: "json", diff --git a/main/urls.py b/main/urls.py index 7d272d78..d6391537 100644 --- a/main/urls.py +++ b/main/urls.py @@ -23,7 +23,6 @@ path('grouped_crawlers', views.list_grouped_crawlers, name='list_grouped_crawlers'), # misc - path('monitoring/', views.monitoring, name='monitoring'), path('iframe/load', views.load_iframe, name='load_iframe'), path('list_process', views.list_process, name='list_process'), From 28c8f8dd220e34ff871961e5aa10309c86f46a54 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 20 Jun 2023 15:26:18 -0300 Subject: [PATCH 86/89] Swagger api doc address added to navbar --- api/views/task.py | 1 + main/staticfiles/img/swagger.svg | 60 ++++++++++++++++++++++++++++++++ main/templates/main/base.html | 5 +++ 3 files changed, 66 insertions(+) create mode 100644 main/staticfiles/img/swagger.svg diff --git a/api/views/task.py b/api/views/task.py index e35cc2aa..f88fcb61 100644 --- a/api/views/task.py +++ b/api/views/task.py @@ -327,6 +327,7 @@ def filter(self, request): start_date = None if 'start_date' in query_params: start_date = self.__str2date(query_params['start_date']) + if end_date is None or start_date is None: msg = {'message': 'You must send the params start_date and end_date, both in the format day-month-year' + ' in the query params of the url. 
Eg.: ?start_date=23-04-2023&end_date=01-01-2020, etc.'} diff --git a/main/staticfiles/img/swagger.svg b/main/staticfiles/img/swagger.svg new file mode 100644 index 00000000..454542f3 --- /dev/null +++ b/main/staticfiles/img/swagger.svg @@ -0,0 +1,60 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/main/templates/main/base.html b/main/templates/main/base.html index 5920031f..0f7ff08e 100644 --- a/main/templates/main/base.html +++ b/main/templates/main/base.html @@ -52,6 +52,11 @@
    + + + + + Wiki
    From fd70bcf8457c8f0142269366f0373641c693c348 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 20 Jun 2023 15:41:54 -0300 Subject: [PATCH 87/89] Reusing docker domain name --- crawler_manager/settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawler_manager/settings.py b/crawler_manager/settings.py index 2afb68d4..6a07b7a3 100644 --- a/crawler_manager/settings.py +++ b/crawler_manager/settings.py @@ -3,7 +3,7 @@ # Kafka host information KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' KAFKA_CONSUMER_TIMEOUT = 120000 KAFKA_CONSUMER_COMMIT_INTERVAL_MS = 5000 @@ -39,7 +39,7 @@ WRITER_TOPIC = os.getenv('WRITER_TOPIC', KAFKA_TOPIC_PREFIX + '.writer') STOPPED_SPIDER_NOTIFICATION_ADDRESS = os.getenv( - 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://localhost:8000/detail/stop_crawl/{crawler_id}') + 'STOPPED_SPIDER_NOTIFICATION_ADDRESS', 'http://web:8000/detail/stop_crawl/{crawler_id}') TASK_TOPIC = os.getenv('TASK_TOPIC', KAFKA_TOPIC_PREFIX + 'task_topic') TASK_DATA_CONSUMER_GROUP = os.getenv('TASK_DATA_CONSUMER_DATA', KAFKA_TOPIC_PREFIX + '.task_data_group') From c1e2fea6d279512c27e820f2463e0255716061ed Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 27 Jun 2023 10:40:17 -0300 Subject: [PATCH 88/89] Fix merge errors --- api/urls.py | 2 + api/views/crawler.py | 56 ++++++- api/views/task.py | 6 +- main/crawling_timer.py | 2 +- main/models.py | 9 +- main/templates/main/detail_crawler.html | 2 +- main/urls.py | 1 + main/utils.py | 142 +++++++++++++----- scheduler/src/scheduler.py | 4 +- .../src/crawling/distributed_scheduler.py | 3 - 10 files changed, 173 insertions(+), 54 deletions(-) diff --git a/api/urls.py b/api/urls.py index d5f628ea..876a774e 100644 --- a/api/urls.py +++ b/api/urls.py @@ -35,6 +35,8 @@ path('crawler//run', views.CrawlerViewSet.as_view({'get': 'run'}), name='crawler-run'), path('crawler//stop', views.CrawlerViewSet.as_view({'get': 'stop'}), name='crawler-run'), path('crawler//group', views.CrawlerViewSet.as_view({'get': 'group'}), name='crawler-group'), + path('crawler//test/start', views.CrawlerViewSet.as_view({'get': 'test'}), name='crawler-test-start'), + path('crawler//test/stop', views.CrawlerViewSet.as_view({'get': 'stop'}), name='crawler-test-stop'), # instance path('instance/', views.CrawlerInstanceViewSet.as_view(only_list_action), name='instance'), diff --git a/api/views/crawler.py b/api/views/crawler.py index fd8360b8..2e435f6a 100644 --- a/api/views/crawler.py +++ b/api/views/crawler.py @@ -3,12 +3,13 @@ from rest_framework.response import Response from django.db import transaction +from django.conf import settings from main.models import CrawlRequest, ParameterHandler, ResponseHandler from main.serializers import CrawlRequestSerializer from main.utils import (add_crawl_request, unqueue_crawl_requests, - process_run_crawl, process_stop_crawl) + process_run_crawl, process_stop_crawl, process_start_test_crawler) from drf_yasg.utils import swagger_auto_schema from drf_yasg import openapi @@ -374,4 +375,55 @@ def group(self, request, pk): 'base_url': item.base_url, }) - return Response(json_data, status=status.HTTP_200_OK) \ No newline at end of file + return Response(json_data, status=status.HTTP_200_OK) + @swagger_auto_schema( + operation_summary='Testa o coletor.', + 
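# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the patch: starting a time-boxed test run
# through the new test endpoint added in this patch. The base URL, the crawler
# id and the runtime value are assumptions; when `runtime` is omitted the view
# falls back to settings.RUNTIME_OF_CRAWLER_TEST.
import requests

resp = requests.get('http://localhost:8000/api/crawler/1/test/start',
                    params={'runtime': 120})   # seconds
print(resp.status_code, resp.json())   # 200 with a message, or 400 with an error
# ---------------------------------------------------------------------------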
operation_description='Ao chamar por esse endpoint, o coletor começará o seu processo de teste.', + manual_parameters=[ + openapi.Parameter( + 'id', + openapi.IN_PATH, + description='ID único do crawler.', + type=openapi.TYPE_INTEGER + ), + openapi.Parameter( + 'runtime', + openapi.IN_QUERY, + description='Tempo de execução do teste em segundos.', + default=settings.RUNTIME_OF_CRAWLER_TEST, + type=openapi.TYPE_INTEGER + ), + ], + responses={ + 200: openapi.Response( + description='O processo de teste do coletor foi iniciado.' + ), + 400: openapi.Response( + description='Coletor não encontrado.', + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + 'error': openapi.Schema( + type=openapi.TYPE_STRING, + description='Mensagem de erro.' + ) + } + ) + ) + } + ) + def test(self, request, pk): + try: + CrawlRequest.objects.get(pk=pk) + + except CrawlRequest.DoesNotExist: + return Response({'error': 'Coletor não encontrado.'}, status=status.HTTP_404_NOT_FOUND) + + runtime = int(request.query_params.get('runtime', settings.RUNTIME_OF_CRAWLER_TEST)) + + code, msg = process_start_test_crawler(pk, runtime) + + if code == settings.API_ERROR: + return Response({'error': msg}, status=status.HTTP_400_BAD_REQUEST) + + return Response({'message': msg}, status=status.HTTP_200_OK) diff --git a/api/views/task.py b/api/views/task.py index f88fcb61..06a17ad1 100644 --- a/api/views/task.py +++ b/api/views/task.py @@ -184,7 +184,7 @@ def create(self, request): 'action': 'create', 'data': response.data } - crawler_manager.message_sender.send(TASK_TOPIC, message) + crawler_manager.MESSAGE_SENDER.send(TASK_TOPIC, message) return response @swagger_auto_schema( @@ -213,7 +213,7 @@ def update(self, request, pk=None): 'action': 'update', 'data': response.data } - crawler_manager.message_sender.send(TASK_TOPIC, message) + crawler_manager.MESSAGE_SENDER.send(TASK_TOPIC, message) return response @swagger_auto_schema( @@ -246,7 +246,7 @@ def destroy(self, request, pk=None): 'id': pk } } - crawler_manager.message_sender.send(TASK_TOPIC, message) + crawler_manager.MESSAGE_SENDER.send(TASK_TOPIC, message) return response def __str2date(self, s: str) -> datetime: diff --git a/main/crawling_timer.py b/main/crawling_timer.py index f0111ce1..b682244d 100644 --- a/main/crawling_timer.py +++ b/main/crawling_timer.py @@ -12,7 +12,7 @@ def __init__(self, data_path: str, runtime: float = 300, server_address: str = 'http://web:8000', - stop_crawler_endpoint: str = '/api/crawlers/{crawler_id}/stop_test') -> None: + stop_crawler_endpoint: str = '/api/crawler/{crawler_id}/test/stop') -> None: self.crawler_id = crawler_id self.test_instance_id = test_instance_id diff --git a/main/models.py b/main/models.py index 0f8b4920..597d804b 100644 --- a/main/models.py +++ b/main/models.py @@ -8,6 +8,7 @@ from django.core.validators import MinValueValidator, RegexValidator from django.db import models from django.db.models.base import ModelBase +from django.conf import settings from django.utils import timezone from typing_extensions import Literal, TypedDict @@ -369,10 +370,10 @@ def last_instance(self): return None def __check_if_crawler_worked(self, instance_id) -> bool: - files_path = f'/data/{self.data_path}/{instance_id}/data/' - - raw_pages_crawled = os.listdir(files_path + 'raw_pages/') - files_crawled = os.listdir(files_path + 'files/') + files_path = os.path.join(settings.OUTPUT_FOLDER, self.data_path, str(instance_id), 'data') + + raw_pages_crawled = os.listdir(files_path + '/raw_pages/') + files_crawled = 
os.listdir(files_path + '/files/') for ignore_file in ('file_description.jsonl', 'temp', 'browser_downloads'): if ignore_file in raw_pages_crawled: diff --git a/main/templates/main/detail_crawler.html b/main/templates/main/detail_crawler.html index 4599c518..19747d21 100644 --- a/main/templates/main/detail_crawler.html +++ b/main/templates/main/detail_crawler.html @@ -394,7 +394,7 @@ var CRAWLER_QUEUE_API_ADDRESS = SERVER_ADDRESS + '/api/crawler_queue/1/'; var RUNNING_TEST_MODE = {% if running_test_mode %} true {% else %} false {% endif %}; - var TEST_STARTED_AT = {{ test_started_at }}; + var TEST_STARTED_AT = {% if test_started_at %} {{ test_started_at }} {% else %} null {% endif %}; var TEST_RUNTIME = {{ test_runtime }}; diff --git a/main/urls.py b/main/urls.py index d6391537..b0439913 100644 --- a/main/urls.py +++ b/main/urls.py @@ -17,6 +17,7 @@ path('edit_group//', views.edit_grouped_crawlers, name='edit_grouped_crawlers'), path('delete//', views.delete_crawler, name='delete_crawler'), + path("test//", views.test_crawler, name="test_crawler"), # grouped crawlers path('new_group/', views.create_grouped_crawlers, name='create_grouped_crawlers'), diff --git a/main/utils.py b/main/utils.py index 53781edd..effdd1f8 100644 --- a/main/utils.py +++ b/main/utils.py @@ -1,16 +1,24 @@ import os import subprocess -from typing_extensions import Literal from django.db import transaction +from django.db.models import Q from django.utils import timezone +from django.conf import settings + +from typing_extensions import Literal import crawler_manager.crawler_manager as crawler_manager -from crawler_manager.settings import OUTPUT_FOLDER -from main.models import CrawlRequest, CrawlerInstance, CrawlerQueue, CrawlerQueueItem -from main.forms import ParameterHandlerFormSet, ResponseHandlerFormSet +from crawler_manager.crawler_manager import LOG_WRITER +from crawler_manager.settings import OUTPUT_FOLDER, WRITER_TOPIC +from main.forms import ParameterHandlerFormSet, ResponseHandlerFormSet +from main.models import (CrawlerInstance, CrawlerQueue, CrawlerQueueItem, + CrawlRequest, Log) +from main.crawling_timer import CrawlingTimer +from main.serializers import CrawlerInstanceSerializer, CrawlRequestSerializer + CRAWLER_QUEUE = None try: @@ -22,14 +30,24 @@ except: pass +class NoInstanceRunningException(Exception): + pass + def update_queue_instance(): global CRAWLER_QUEUE CRAWLER_QUEUE = CrawlerQueue.object() -def create_instance(crawler_id, instance_id): +def create_instance(crawler_id, instance_id, test_mode): mother = CrawlRequest.objects.filter(id=crawler_id) + if test_mode: + mother[0].ignore_data_crawled_in_previous_instances = False + obj = CrawlerInstance.objects.create( - crawler=mother[0], instance_id=instance_id, running=True) + crawler=mother[0], + instance_id=instance_id, + execution_context = 'testing' if test_mode else 'crawling', + running=True) + return obj def add_crawl_request(crawler_id, wait_on: Literal['last_position', 'first_position', 'no_wait'] = 'last_position'): @@ -73,6 +91,28 @@ def add_crawl_request(crawler_id, wait_on: Literal['last_position', 'first_posit queue_type=cr_expec_runtime_cat) queue_item.save() +def delete_instance_and_logs(instance_id: int): + # Ignore logs associated with the instance being deleted + LOG_WRITER.add_instance_to_ignore(instance_id) + + # Remove the logs and the test instance + logs = Log.objects.filter(instance_id=instance_id) + logs.delete() + CrawlerInstance.objects.get(instance_id=instance_id).delete() + + # Free memory + 
LOG_WRITER.remove_instance_to_ignore(instance_id) + +def delete_instance_crawled_folder(data_path: str, instance_id: int): + message = { + 'delete_folder': { + 'data_path': data_path, + 'instance_id': str(instance_id) + } + } + + crawler_manager.MESSAGE_SENDER.send(WRITER_TOPIC, message) + def unqueue_crawl_requests(queue_type: str, update_queue: bool = False): if update_queue: update_queue_instance() @@ -107,7 +147,7 @@ def process_stop_crawl(crawler_id, from_sm_listener: bool = False): # No instance running if instance is None: - raise ValueError("No instance running") + raise NoInstanceRunningException("No instance running") if from_sm_listener and not instance.download_files_finished(): instance.page_crawling_finished = True @@ -116,13 +156,18 @@ def process_stop_crawl(crawler_id, from_sm_listener: bool = False): return instance_id = instance.instance_id - config = CrawlRequest.objects.filter(id=int(crawler_id)).values()[0] + running_crawler_test = instance.execution_context == 'testing' + + crawler = CrawlRequest.objects.get(id=int(crawler_id)) + crawler.update_functional_status_after_run(instance_id) - # FIXME: Colocar esse trecho de código no módulo writer - # computa o tamanho em kbytes do diretório "data" - instance_path = os.path.join(OUTPUT_FOLDER, config['data_path'], str(instance_id), 'data') + if running_crawler_test: + delete_instance_and_logs(instance_id) + delete_instance_crawled_folder(crawler.data_path, instance_id) - command_output = subprocess.run(["du " + instance_path + " -d 0"], shell=True, stdout=subprocess.PIPE) + instance_path = os.path.join(OUTPUT_FOLDER, crawler.data_path, str(instance_id), 'data') + + command_output = subprocess.run(['du ' + instance_path + ' -d 0'], shell=True, stdout=subprocess.PIPE) output_line = command_output.stdout.decode('utf-8').strip('\n') parts = output_line.split('\t') data_size_kbytes = int(parts[0]) @@ -131,7 +176,6 @@ def process_stop_crawl(crawler_id, from_sm_listener: bool = False): num_data_files = instance.number_files_success_download instance = None - instance_info = {} queue_type = None with transaction.atomic(): @@ -147,15 +191,6 @@ def process_stop_crawl(crawler_id, from_sm_listener: bool = False): queue_type = queue_item.queue_type queue_item.delete() - # As soon as the instance is created, it starts to collect and is only modified when it stops, - # we use these fields to define when a collection started and ended - instance_info["started_at"] = str(instance.creation_date) - instance_info["finished_at"] = str(instance.last_modified) - instance_info["data_size_kbytes"] = data_size_kbytes - instance_info["num_data_files"] = num_data_files - - crawler_manager.update_instances_info( - config["data_path"], str(instance_id), instance_info) crawler_manager.stop_crawler(crawler_id) @@ -168,30 +203,36 @@ def remove_crawl_request(crawler_id): queue_item = CrawlerQueueItem.objects.get(crawl_request_id=crawler_id) queue_item.delete() -def process_run_crawl(crawler_id): +def process_run_crawl(crawler_id, test_mode = False): instance = None - instance_info = dict() - crawler_entry = CrawlRequest.objects.filter(id=crawler_id) - data = crawler_entry.values()[0] + crawler = CrawlRequest.objects.get(pk=crawler_id) + data = CrawlRequestSerializer(crawler).data + + # delete unnecessary data + if 'instances' in data: + del data['instances'] # Instance already running - if crawler_entry.get().running: - instance_id = crawler_entry.get().running_instance.instance_id + if crawler.running: + instance_id = crawler.running_instance.instance_id 
raise ValueError("An instance is already running for this crawler " f"({instance_id})") - data = CrawlRequest.process_config_data(crawler_entry.get(), data) - data["instance_id"] = crawler_manager.gen_key() - instance = create_instance(data['id'], data["instance_id"]) - crawler_manager.start_crawler(data.copy()) + data = CrawlRequest.process_config_data(crawler, data) + + instance_id = crawler_manager.gen_key() + instance = create_instance(crawler_id, instance_id, test_mode) - instance_info["started_at"] = str(instance.creation_date) - instance_info["finished_at"] = None + data['instance_id'] = instance_id + data['execution_context'] = instance.execution_context + data['running_in_test_mode'] = test_mode - crawler_manager.update_instances_info( - data["data_path"], str(data["instance_id"]), instance_info) + crawler.functional_status = 'testing' if test_mode else 'testing_by_crawling' + crawler.save() + + crawler_manager.start_crawler(data) return instance @@ -260,8 +301,8 @@ def generate_injector_forms(*args, filter_queryset=False, **kwargs): crawler = kwargs.get('instance') if crawler is None: - raise ValueError("If the filter_queryset option is True, the " + - "instance property must be set.") + raise ValueError('If the filter_queryset option is True, the ' + + 'instance property must be set.') queryset = crawler.parameter_handlers @@ -275,3 +316,28 @@ def generate_injector_forms(*args, filter_queryset=False, **kwargs): prefix='templated_url-responses', queryset=queryset, **kwargs) return parameter_formset, response_formset + +def process_start_test_crawler(crawler_id: int, runtime: float) -> dict: + instance = None + try: + instance = process_run_crawl(crawler_id, True) + + except Exception as e: + return settings.API_ERROR, str(e) + + try: + + test_instance_id = CrawlerInstanceSerializer(instance).data['instance_id'] + + crawler = CrawlRequest.objects.get(pk=crawler_id) + data_path = crawler.data_path + + server_address = 'http://localhost:8000' + + crawling_timer = CrawlingTimer(crawler_id, test_instance_id, data_path, runtime, server_address) + crawling_timer.start() + + return settings.API_SUCCESS, f'Testing {crawler_id} for {runtime}s' + + except Exception as e: + return settings.API_ERROR, str(e) \ No newline at end of file diff --git a/scheduler/src/scheduler.py b/scheduler/src/scheduler.py index a749aaae..16342fed 100644 --- a/scheduler/src/scheduler.py +++ b/scheduler/src/scheduler.py @@ -13,11 +13,11 @@ SERVER_SESSION = requests.sessions.Session() def run_crawler(crawler_id, action): - SERVER_SESSION.get(settings.RUN_CRAWLER_URL + "/api/crawlers/{}/run?action={}".format(crawler_id, action)) + SERVER_SESSION.get(settings.RUN_CRAWLER_URL + "/api/crawler/{}/run?action={}".format(crawler_id, action)) print(f'[{datetime.now()}] [TC] Crawler {crawler_id} processed by schedule...') def run_crawler_once(crawler_id, action): - SERVER_SESSION.get(settings.RUN_CRAWLER_URL + "/api/crawlers/{}/run?action={}".format(crawler_id, action)) + SERVER_SESSION.get(settings.RUN_CRAWLER_URL + "/api/crawler/{}/run?action={}".format(crawler_id, action)) print(f'[{datetime.now()}] [TC] Crawler {crawler_id} processed by schedule...') return schedule.CancelJob diff --git a/spider_manager/src/crawling/distributed_scheduler.py b/spider_manager/src/crawling/distributed_scheduler.py index d1be47d0..8a806314 100644 --- a/spider_manager/src/crawling/distributed_scheduler.py +++ b/spider_manager/src/crawling/distributed_scheduler.py @@ -482,9 +482,6 @@ def enqueue_request(self, request): domain not in 
self.black_domains)) and \ (req_dict['meta']['expires'] == 0 or curr_time < req_dict['meta']['expires']): - - print( - f'Dupefilter {request.url} @ seen? {request_seen}') if not request_seen: notify_new_page_found(req_dict['meta']['attrs']['instance_id']) From 40c7bbbf11ffdc451a9580ea514a9946527280b9 Mon Sep 17 00:00:00 2001 From: Elves Rodrigues Date: Tue, 27 Jun 2023 15:49:57 -0300 Subject: [PATCH 89/89] Some bugs fixed --- api/views/crawler.py | 20 +++++++++++++++++++- scheduler/src/settings.py | 6 +++--- src/schedule/schedule/function_wrapper.py | 5 ----- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/api/views/crawler.py b/api/views/crawler.py index 2e435f6a..a4055e12 100644 --- a/api/views/crawler.py +++ b/api/views/crawler.py @@ -1,3 +1,5 @@ +from datetime import datetime + from rest_framework import status, viewsets from rest_framework.decorators import action from rest_framework.response import Response @@ -5,7 +7,7 @@ from django.db import transaction from django.conf import settings -from main.models import CrawlRequest, ParameterHandler, ResponseHandler +from main.models import CrawlRequest, ParameterHandler, ResponseHandler, Task from main.serializers import CrawlRequestSerializer from main.utils import (add_crawl_request, unqueue_crawl_requests, @@ -217,6 +219,13 @@ def create(self, request, *args, **kwargs): default='wait_on_last_queue_position', enum=['run_immediately', 'wait_on_first_queue_position', 'wait_on_last_queue_position'], required=False + ), + openapi.Parameter( + 'next_run', + openapi.IN_QUERY, + description='Esse parâmetro permite definir a próxima data de execução do coletor. Deve ser informado no formato `YYYY-MM-DD HH:MM:SS`.', + type=openapi.TYPE_STRING, + required=False ) ], ) @@ -225,6 +234,15 @@ def run(self, request, pk): query_params = self.request.query_params.dict() action = query_params.get('action', 'wait_on_last_queue_position') + # check if there is a task for this crawler + task = Task.objects.filter(crawl_request__pk=pk).first() + + if task: + next_run = query_params.get('next_run') + task.next_run = datetime.strptime(next_run, '%Y-%m-%d %H:%M:%S') if next_run else None + task.last_run = datetime.now() + task.save() + if action == 'run_immediately': wait_on = 'no_wait' diff --git a/scheduler/src/settings.py b/scheduler/src/settings.py index 4f782a8e..114b0be2 100644 --- a/scheduler/src/settings.py +++ b/scheduler/src/settings.py @@ -1,6 +1,6 @@ import os -KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'localhost:9092').split(',')] +KAFKA_HOSTS = [x.strip() for x in os.getenv('KAFKA_HOSTS', 'kafka:9092').split(',')] KAFKA_TOPIC_PREFIX = os.getenv('KAFKA_TOPIC_PREFIX', 'crawler_ufmg') KAFKA_CONSUMER_AUTO_OFFSET_RESET = 'earliest' KAFKA_CONSUMER_TIMEOUT = 120000 @@ -14,12 +14,12 @@ KAFKA_SESSION_TIMEOUT_MS = 2 * 60 * 1000 TASK_TOPIC = os.getenv('TASK_TOPIC', KAFKA_TOPIC_PREFIX + 'task_topic') TASK_DATA_CONSUMER_GROUP = os.getenv('TASK_DATA_CONSUMER_DATA', KAFKA_TOPIC_PREFIX + '.task_data_group') -RUN_CRAWLER_URL = "http://localhost:8000" +RUN_CRAWLER_URL = "http://web:8000" WAIT_TIME = 1 MAX_WAIT_TIME = 60 -DB_HOST = os.getenv('DB_HOST', 'localhost') +DB_HOST = os.getenv('DB_HOST', 'db') DB_PORT = os.getenv('DB_PORT', '5432') DB_USER = os.getenv('POSTGRES_USER', 'django') DB_PASS = os.getenv('POSTGRES_PASSWORD', 'c01_password') diff --git a/src/schedule/schedule/function_wrapper.py b/src/schedule/schedule/function_wrapper.py index 61b1cb89..1b0be787 100644 --- a/src/schedule/schedule/function_wrapper.py +++ 
b/src/schedule/schedule/function_wrapper.py
@@ -11,11 +11,6 @@ def __init__(self, funct: Callable, *args, **kwargs):
     def __call__(self, next_run: datetime = None) -> Any:
         # check if the funct accepts a next_run argument
-        print('-' * 15)
-        print(f'The funct {self.funct} requires next_run: {self.funct_requires_next_run()}')
-        print(f'next_run: {next_run}')
-        print('-' * 15)
-
         if self.funct_requires_next_run():
             self.kwargs["next_run"] = next_run
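
With the debug prints removed, the remaining logic in FunctionWrapper.__call__ only forwards next_run when the wrapped callable declares such a parameter. The patch does not show how funct_requires_next_run is implemented; the following is a minimal sketch of the idea using inspect.signature, with an illustrative run_crawler stand-in (every name other than next_run is an assumption, not code from this series):

    # Sketch only: funct_requires_next_run() is assumed to inspect the wrapped
    # callable's signature; the real implementation is not part of this patch.
    import inspect
    from datetime import datetime
    from typing import Any, Callable

    class FunctionWrapperSketch:
        def __init__(self, funct: Callable, *args, **kwargs) -> None:
            self.funct = funct
            self.args = args
            self.kwargs = kwargs

        def funct_requires_next_run(self) -> bool:
            # True if the wrapped callable declares a 'next_run' parameter.
            return 'next_run' in inspect.signature(self.funct).parameters

        def __call__(self, next_run: datetime = None) -> Any:
            # Forward next_run only when the wrapped callable can accept it.
            if self.funct_requires_next_run():
                self.kwargs['next_run'] = next_run
            return self.funct(*self.args, **self.kwargs)

    def run_crawler(crawler_id: int, action: str, next_run: datetime = None) -> None:
        # Illustrative stand-in for a scheduler job; prints instead of calling the API.
        print(crawler_id, action, next_run)

    job = FunctionWrapperSketch(run_crawler, 1, 'run_immediately')
    job(next_run=datetime(2023, 7, 1, 3, 0))  # next_run is passed through to run_crawler

A scheduler job built this way could format the upcoming run as next_run.strftime('%Y-%m-%d %H:%M:%S') before calling the run endpoint, matching the format that the new next_run query parameter in api/views/crawler.py is parsed with.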