The method of parsing (scraping), discussed in the article Parsing (scraping) using Google Apps Script is incredibly simple and easy to use. There is no need for complex programs, or high qualifications or special knowledge, or a lot of experience. All code is written directly in the Google Sheets editor, and then placed and run there.
In addition to all this, the script can be run on a schedule at any time in 24/7 mode, and the result can be processed right there and, if necessary, sent by email or a new file with the received and processed data can be created on Google disk.
However, there may be problems here. First of all, this script does not work with sites (more precisely, with pages) where pre-registration is required. Since, to register on the site, you need to create a session and place its code in special files - cookies. However, the standard UrlFetchApp function used in Apps Script to access a website does not allow cookies.
There is another problem, which has recently manifested itself more and more often. This is the unwillingness of site owners to share information with bots. Therefore, you can often see a picture when the html-code issued by the server at the request of the script is significantly different from the code that is issued to the browser.
Actually, precisely in order to quickly find out whether it is possible to parse (scrap) the site we need or not, this program was written:
function onOpen() {
var ui = SpreadsheetApp.getUi();
// Or DocumentApp or FormApp.
ui.createMenu('Custom Menu')
.addItem('Check', 'fastAnalyzer')
.addSeparator()
.addItem('Create Sheeet CHECK', 'createSheet')
.addToUi();
}
function fastAnalyzer() {
const ss = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('Check');
const urlDomain = ss.getRange(2, 1).getValue();
let url = urlDomain;
let count = 0;
let response = UrlFetchApp.fetch(url);
let lll = response.getContentText().length;
ss.getRange(2, 2).setValue(response.getResponseCode());
ss.getRange(2, 3).setValue(lll);
let html = response.getContentText();
let whatLookingFor = ss.getRange(3, 2).getValue();
let pos = html.indexOf(whatLookingFor);
ss.getRange(3, 3).setValue(pos);
ss.getRange(4, 1, ss.getLastRow(), 2).clearContent();
let totalNumber = html.length
let i = 0;
// Check defoult value of delta - number of symbols in 1 row
// If the cell is empty, print the defole value = 5000
let delta = Number.parseInt(ss.getRange(2, 4).getValue());
if (isNaN(delta)) {
delta = 5000
// Print defoult value of number of symbols in 1 row
ss.getRange(2, 4).setValue(delta);
};
let iStart = 0;
while (true) {
let iEnd = iStart + delta;
if (iEnd > totalNumber) iEnd = totalNumber;
ss.getRange(4 + i, 1).setValue("Символы от " + iStart + " до " + iEnd);
ss.getRange(4 + i, 2).setValue(html.slice(iStart,iEnd));
// let currentRow = (4+i) + ":" + (4+i)
// ss.getRange(3, 3).setValue(currentRow);
// ss.getRange(currentRow).activate();
// ss.setRowHeight(4+i, 200);
i++;
if (iEnd - iStart < delta) break;
iStart = iEnd;
};
}
function createSheet() {
// is exist Check sheet?
let s = SpreadsheetApp.getActiveSpreadsheet();
// create if not exist
if(!s.getSheetByName('Check')) {
s.insertSheet('Check', 0);
ssc = s.getSheetByName('Check');
ssc.getRange(1, 1).setValue("Url сайта");
ssc.getRange(1, 2).setValue("Код ответа сайта");
ssc.getRange(1, 3).setValue("Число символов на выбранной странице");
ssc.getRange(1, 4).setValue("Изменить вывод числа знаков в однй строке");
ssc.getRange(2, 1).setBackground('ACCENT3'); // yellow background cell
ssc.getRange(3, 1).setValue("Какой элемент кода ищем?");
ssc.getRange(3, 2).setBackground('ACCENT3'); // yellow background cell
};
}
So, all you need to work with this program is
The program is ready to go! All that is needed now is to add the site url to the Google tables page, the code element that you want to find on this page, and run the script. The answer will be ready in a second!
You can find more information in this video (RU voice):