Blog

Web Scraping With Google Apps Script

The script below allows you to automatically find and download information from one of the freelance exchanges.

The domain name is taken from the Google Spread Sheets page. And search results are also uploaded there.

The function scraper() , which contains two cycles, is used as a control script. In the first loop ( for-loop ), we access the web pages of the site and save them in the variable html . In the second loop ( while-loop ), this variable is sequentially processed using three auxiliary functions:

  • The getBlock function finds the part of the html code (code block) inside the tag (usually by the unique value of the attributes of this tag), and returns this block as a string value;
  • The deleteBlock function, on the contrary, deletes the found fragment of the html code inside the block and also returns the remainder of this block as a string value.
  • Unlike the first two functions, the getOpenTag function does not delete the found tag, but returns it as a string value. True, not the whole tag, but only the first (opening part) of this tag.

function scraper() {
  
  const ss = SpreadsheetApp.getActiveSpreadsheet().getSheetByName('Sheet1');
  const urlDomain = ss.getRange(1, 1).getValue();
  
  let url = urlDomain;
  let count = 0;
  for (let page = 1; page < 5; page++) {
    url = urlDomain + page + '/';
    if (page == 1) url = urlDomain;
    let response = UrlFetchApp.fetch(url);
    ss.getRange(2, 1).setValue(response.getResponseCode());
    let html = response.getContentText();
    let p = 0;
    while (true) {
    
      let out = getBlock(html, 'div', html.indexOf('class="JobSearchCard-primary"', p));
      let block = out[0];
      p = out[1] + 1;
      if (p == 0) break;
      
      let title1 = getBlock(block, 'div', 0)[0];
      let title = getBlock(title1, 'a', 0)[0];
      
      let link = getOpenTag(title1, 'a', 0);
      link = getAttrName(link, 'href', 0)
      let formula = '=HYPERLINK("https://www.freelancer.com' +link + '", "' + title + '")';
      ss.getRange(3 + 3 * count, 2).setValue(formula);
      
      let price = getBlock(block, 'div', block.indexOf('class="JobSearchCard-primary-price'))[0];
      if (price.includes('span')) price =  deleteBlock(price, 'span', price.indexOf('span'));
      ss.getRange(3 + 3 * count + 1, 2).setValue(price).setHorizontalAlignment('right');
      
      let description = getBlock(block, 'p', block.indexOf('class="JobSearchCard-primary-description"'))[0];
      ss.getRange(3 + 3 * count, 1, 3).mergeVertically().setValue(description)
          .setBorder(true, true, true, true, null, null, '#000000', SpreadsheetApp.BorderStyle.SOLID)
          .setVerticalAlignment('middle')
          .setWrapStrategy(SpreadsheetApp.WrapStrategy.WRAP);
  
      ss.getRange(3 + 3 * count, 2, 3).setBorder(true, true, true, true, null, null, '#000000', SpreadsheetApp.BorderStyle.SOLID);
      
      let cat = getBlock(block, 'div', block.indexOf('class="JobSearchCard-primary-tags"'))[0];
      cat = cat.split('</a>').map(item => item.split('>')[1]);
      cat.pop();
      cat = cat.join(', ');
      ss.getRange(3 + 3 * count + 2, 2).setValue(cat);
    
      count++;
    
    }; 
  };
}

function getAttrName(html, attr, i) {
  let idxStart = html.indexOf(attr , i);
  if (idxStart == -1) return "Can't to find attr " + attr + ' !';
  idxStart = html.indexOf('"' , idxStart) + 1;
  let idxEnd = html.indexOf('"' , idxStart);
  return html.slice(idxStart,idxEnd).trim();
}

function getOpenTag(html, tag, idxStart) {
  let openTag = '<' + tag;
  let lenOpenTag = openTag.length;
  // where we are?
  if (html.slice(idxStart, idxStart + lenOpenTag) != openTag) {
    idxStart = html.lastIndexOf(openTag, idxStart);
    if (idxStart == -1) return "Can't to find openTag " + openTag + ' !';
  };
  // begin loop after openTag
  let idxEnd = html.indexOf('>', idxStart) + 1;
  if (idxStart == -1) return "Can't to find closing bracket '>' for openTag!";
  return html.slice(idxStart,idxEnd).trim();
}

function deleteBlock(html, tag, idxStart) { // delete opening & closing tag and info between them
  let openTag = '<' + tag;
  let lenOpenTag = openTag.length;
  let closeTag = '</' + tag + '>';
  let lenCloseTag = closeTag.length;
  let countCloseTags = 0;
  let iMax = html.length;
  let idxEnd = 0;
  // where we are?
  if (html.slice(idxStart, idxStart + lenOpenTag) != openTag) {
    idxStart = html.lastIndexOf(openTag, idxStart);
    if (idxStart == -1) return ["Can't to find openTag " + openTag + ' !', -1];
  };
  // begin loop after openTag
  let i = html.indexOf('>') + 1;
  
  while (i <= iMax) {
    i++;
    if (i === iMax) {
      return ['Could not find closing tag for ' + tag, -1];
    };
    let carrentValue = html[i];
    if (html[i] === '<'){
      let closingTag = html.slice(i, i + lenCloseTag);
      let openingTag = html.slice(i, i + lenOpenTag);
      if (html.slice(i, i + lenCloseTag) === closeTag) {
        if (countCloseTags === 0) {
          idxEnd = i + lenCloseTag;
          break;
        } else {
          countCloseTags -= 1;
        };
      } else if (html.slice(i, i + lenOpenTag) === openTag) {
        countCloseTags += 1;
      };
    };
  };
  return (html.slice(0, idxStart) + html.slice(idxEnd, iMax)).trim();
}

function getBlock(html, tag, idxStart) {  // <tag .... > Block </tag>
  let openTag = '<' + tag;
  let lenOpenTag = openTag.length;
  let closeTag = '</' + tag + '>';
  let lenCloseTag = closeTag.length;
  let countCloseTags = 0;
  let iMax = html.length;
  let idxEnd = 0;
  // where we are?
  if (html.slice(idxStart, idxStart + lenOpenTag) != openTag) {
    idxStart = html.lastIndexOf(openTag, idxStart);
    if (idxStart == -1) return ["Can't to find openTag " + openTag + ' !', -1];
  };
  // change start - will start after openTag!
  idxStart = html.indexOf('>', idxStart) + 1;
  let i = idxStart;
  
  while (i <= iMax) {
    i++;
    if (i === iMax) {
      return ['Could not find closing tag for ' + tag, -1];
    };
    let carrentValue = html[i];
    if (html[i] === '<'){
      let closingTag = html.slice(i, i + lenCloseTag);
      let openingTag = html.slice(i, i + lenOpenTag);
      if (html.slice(i, i + lenCloseTag) === closeTag) {
        if (countCloseTags === 0) {
          idxEnd = i - 1;
          break;
        } else {
          countCloseTags -= 1;
        };
      } else if (html.slice(i, i + lenOpenTag) === openTag) {
        countCloseTags += 1;
      };
    };
  };
  return [html.slice(idxStart,idxEnd + 1).trim(), idxEnd];
}

You can find more information in this video (RU voice):