Agent/AuctionServices/Scraper/Interencheres/InterencheresData.js

395 lines
13 KiB
JavaScript

// interencheresData.js
const {ScraperTools} = require('../Scraper');
const urlModule = require('url');
const moment = require('moment-timezone');
const { Console } = require('console');
const { title } = require('process');
/**
 * Scraper for the Interencheres auction platform.
 *
 * Relies on helpers inherited from ScraperTools (getTextContent,
 * getTextContentElement, getAttributeElement, clickLink, ElementExists).
 * Those helpers are not visible in this file: the code below assumes the
 * text/attribute getters return '' when no XPath matches and that clickLink
 * resolves to a truthy value on success — TODO confirm against ScraperTools.
 */
class InterencheresData extends ScraperTools {
  // Platform identifier written into every scraped lot (see _getLotInfoList).
  _Name = 'interencheres';

  /**
   * Split an Interencheres URL into its sale type, sale id and lot id.
   *
   * The sale id is the last '-'-separated token of the second path segment;
   * the lot id is the token between the first '-' and the first '.' of the
   * third path segment (e.g. "lot-789.html" -> "789").
   *
   * @param {string} url - Absolute sale or lot URL.
   * @returns {Promise<{typeSale: string, saleID: string, lotID: (string|number)}>}
   *   lotID is the number 0 when the URL carries no lot segment.
   * @throws {TypeError} When the URL is invalid or has no sale segment.
   */
  getUrlInfo = async (url) => {
    const parsedUrl = new urlModule.URL(url);
    const pathParts = parsedUrl.pathname.split('/').filter(Boolean);
    if (pathParts.length < 2) {
      // Same error type as the implicit crash this guard replaces, but readable.
      throw new TypeError('Interencheres URL has no sale segment : ' + url);
    }
    const typeSale = pathParts[0];
    const saleTokens = pathParts[1].split('-');
    const saleID = saleTokens[saleTokens.length - 1];

    // Only lot URLs have a third segment; anything unparsable keeps lotID at 0.
    let lotID = 0;
    if (pathParts.length > 2) {
      const lotToken = pathParts[2].split('-')[1];
      if (lotToken !== undefined) {
        lotID = lotToken.split('.')[0];
      }
    }
    return {
      'typeSale': typeSale,
      'saleID': saleID,
      'lotID': lotID
    };
  };

  // ## Lot

  /**
   * Open a lot page and collect the URLs of the pictures it downloads.
   *
   * Picture URLs are captured from network responses whose URL contains the
   * Interencheres thumbor host, while the gallery "next" arrow is clicked to
   * make the page request the remaining pictures.
   *
   * @param {object} page - Puppeteer-like page (on/off/goto/waitForTimeout).
   * @param {string} Url - Lot URL to open.
   * @param {number} [maxClicks=20] - Upper bound on "next" clicks.
   * @returns {Promise<string[]>} De-duplicated picture URLs in arrival order.
   */
  getPictures = async (page, Url, maxClicks = 20) => {
    const pictureUrls = [];

    const onResponse = (response) => {
      const responseUrl = response.url();
      if (!responseUrl.includes('thumbor-indbupload.interencheres.com')) {
        return;
      }
      // Only record the picture once its body was actually received; a body
      // that cannot be read (redirect, navigation, ...) must not crash the run.
      response.buffer().then(() => {
        console.log("push "+responseUrl);
        pictureUrls.push(responseUrl);
      }).catch((error) => {
        console.error('picture response not readable : '+responseUrl, error);
      });
    };

    page.on('response', onResponse);
    try {
      console.log('go to : '+Url);
      await page.goto(Url);

      const picturesNumberXPath = [
        "//div[contains(@class, 'pswp__counter')]"
      ];
      const picturesNumberString = await this.getTextContent(picturesNumberXPath, page, 'picturesNumberXPath');

      // Counter text is read as "<current> / <total>"; without a usable total
      // fall back to a large value so that only maxClicks bounds the loop.
      let picturesNumber = 100;
      if (picturesNumberString != '') {
        const parsedTotal = parseInt(picturesNumberString.split(" / ")[1], 10);
        if (!Number.isNaN(parsedTotal)) {
          picturesNumber = parsedTotal;
        }
        console.log('picturesNumber : '+picturesNumber);
      }

      const ButtonNextXPath = [
        "//button[contains(@class, 'pswp__button--arrow--right')]"
      ];
      // Always click/wait once so pending picture responses get time to
      // settle, then stop after (total - 1) clicks or maxClicks clicks.
      let clicks = 0;
      do {
        await this.clickLink(ButtonNextXPath, page, 'ButtonNextXPath');
        await page.waitForTimeout(300);
        clicks++;
      } while (clicks + 1 < picturesNumber && clicks < maxClicks);
    } finally {
      // Detach the listener so repeated calls on one page do not pile up handlers.
      page.off('response', onResponse);
    }

    // A Set keeps first occurrences in insertion order.
    return [...new Set(pictureUrls)];
  };

  /**
   * Read the lot number shown on a lot page.
   *
   * @param {object} page - Page currently showing a lot.
   * @returns {Promise<string>} Text with the first 'Lot ' removed, '' if not found.
   */
  getLotNumber = async (page) => {
    const lotNumberXPath = [
      '/html/body/div[1]/div/div/div/main/div/div/div[2]/div/div[1]/div[2]/div[2]/div[1]',
      '/html/body/div[1]/div/div/div[1]/main/div/div/div[2]/div/div[1]/div[2]/div[1]/div[1]'
    ];
    const lotNumberString = await this.getTextContent(lotNumberXPath, page, 'lotNumberXPath');
    if (lotNumberString == '') {
      return '';
    }
    return lotNumberString.replace('Lot ', '');
  };

  /**
   * Read the low/high estimate of a lot.
   *
   * Numbers may use whitespace as thousands separator ("1 500") or none
   * ("1500"). The first number is the low estimate, the second the high one.
   *
   * @param {object} page - Page currently showing a lot.
   * @returns {Promise<{EstimateLow: number, EstimateHigh: number}>}
   *   Missing values are reported as 0.
   */
  getEstimate = async (page) => {
    const EstimateXPath = [
      '/html/body/div[1]/div/div/div/main/div/div/div[2]/div/div[1]/div[2]/div[3]/div[2]/span',
      '/html/body/div[1]/div/div/div[1]/main/div/div/div[2]/div/div[1]/div[2]/div[2]/div[2]/span'
    ];
    const EstimateString = await this.getTextContent(EstimateXPath, page, 'EstimateXPath');
    console.log('EstimateString : '+EstimateString);

    let EstimateLow = 0;
    let EstimateHigh = 0;
    if (EstimateString != '') {
      // Either a whitespace-grouped number ("12 500") or a plain digit run
      // ("12500"); the plain alternative prevents "1500" from being split
      // into "150" and "0".
      const matches = EstimateString.match(/\d{1,3}(?:\s\d{3})+(?!\d)|\d+/g);
      const toInt = (text) => parseInt(text.replace(/\s/g, ''), 10);
      if (matches === null) {
        console.log('Could not extract numbers.');
      } else if (matches.length >= 2) {
        EstimateLow = toInt(matches[0]);
        EstimateHigh = toInt(matches[1]);
        console.log('Low:', EstimateLow);
        console.log('High:', EstimateHigh);
      } else {
        EstimateLow = toInt(matches[0]);
      }
    }
    return {EstimateLow, EstimateHigh};
  };

  /**
   * Read the lot description.
   *
   * @param {object} page - Page currently showing a lot.
   * @returns {Promise<string>} Whatever getTextContent yields for the description XPaths.
   */
  getDescription = async (page) => {
    const DescriptionXPath = [
      '//div[contains(@class, "description")]',
      '/html/body/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/div[4]/div',
      '/html/body/div[1]/div/div/div[1]/main/div/div/div[2]/div/div[1]/div[2]/div[4]/div',
      '/html/body/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/div[3]/div'
    ];
    return await this.getTextContent(DescriptionXPath, page, 'DescriptionXPath');
  };

  /**
   * Click the "Frais de vente" link and read the sale fees it reveals.
   *
   * @param {object} page - Page currently showing a lot.
   * @returns {Promise<{feesText: string, fees: (string|number)}>}
   *   feesText is the whitespace-normalised fee text ('' if unavailable).
   *   fees is the first number of that text as a string with '.' as decimal
   *   separator, or the number 0 when nothing could be read (this mixed
   *   typing is kept for existing callers).
   */
  getFees = async (page) => {
    let feesText = '';
    let fees = 0;
    const ButtonFeesXPath = [
      './/a[contains(text(),"Frais de vente")]',
      '/html/body/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/div[3]/div[2]/a',
      '/html/body/div[1]/div/div/div[1]/main/div/div/div[2]/div/div[1]/div[2]/div[3]/div[2]/a',
      '/html/body/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/div[2]/div[2]/a'
    ];
    if (await this.clickLink(ButtonFeesXPath, page, 'ButtonFeesXPath')) {
      const FeesXPath = [
        "//strong[contains(text(), 'Frais de vente :')]/following-sibling::span",
        '/html/body/div[1]/div/div/div[3]/div/div/div[2]/div/p[1]/p[1]/span',
      ];
      feesText = await this.getTextContent(FeesXPath, page, 'FeesXPath');
      if (feesText != '') {
        feesText = feesText.replace(/[\n]/g, '').replace(/\s+/g, ' ').trim();
        // The site is French, so decimals may be written with a comma ("14,28").
        const matches = feesText.match(/\d+(?:[.,]\d+)?/);
        if (matches) {
          fees = matches[0].replace(',', '.');
        }
      }
    }
    return {feesText, fees};
  };

  /**
   * Extract the lot id from a lot URL.
   *
   * @param {string} url - Lot URL.
   * @returns {Promise<(string|number)>} Lot id as produced by getUrlInfo (0 when absent).
   */
  getLotID = async (url) => {
    const UrlInfo = await this.getUrlInfo(url);
    const id_lot = UrlInfo.lotID;
    console.log('id_lot : '+id_lot);
    return id_lot;
  };

  /**
   * Derive the sale id and the sale URL from a sale or lot URL.
   *
   * @param {string} url - Sale or lot URL.
   * @returns {Promise<{id_sale: string, urlSale: string}>}
   *   urlSale is the input without its "/lot-..." part and without query string.
   */
  getSaleIdUrl = async (url) => {
    const UrlInfo = await this.getUrlInfo(url);
    const id_sale = UrlInfo.saleID;
    let urlSale = url;
    // remove lot information if present
    if (urlSale.includes('/lot-')) {
      urlSale = url.split("/lot-")[0];
    }
    // remove parameters
    if (urlSale.includes('?')) {
      urlSale = urlSale.split("?")[0];
    }
    console.log('getSaleIdUrl urlSale : '+urlSale);
    return {id_sale, urlSale};
  };

  // ## Lot List

  /**
   * Build the lot summaries for the lot elements of one result page.
   *
   * @param {object} page - Page the elements belong to.
   * @param {Array<object>} Elements - Element handles, one per lot card.
   * @returns {Promise<Array<object>>} One entry per element, in order. An
   *   element whose data could not be read yields an empty object (the error
   *   is logged) so the list length always equals Elements.length.
   */
  _getLotInfoList = async (page, Elements) => {
    const LotnameXPath = [
      './/a/div/div/div[5]/div/div[1]',
      './/a/div/div/div[4]/div/div[1]',
    ];
    const LotUrlXPath = [
      './/a'
    ];
    const LotList = [];
    for (const element of Elements) {
      let Lot = {};
      try {
        const Lotname = await this.getTextContentElement(LotnameXPath, page, element, 'LotnameXPath');
        // idPlatform from the url
        const urlLot = await this.getAttributeElement(LotUrlXPath, page, element, 'href', 'UrlListLot');
        const match = urlLot.match(/lot-(.*)\.html/);
        if (match === null) {
          throw new Error('lot id not found in url : ' + urlLot);
        }
        Lot = {
          title: Lotname,
          idPlatform: match[1],
          platform: this._Name,
          // undefined when the title does not contain 'Lot '
          lotNumber: Lotname.split('Lot ')[1]
        };
      } catch (e) {
        console.error(e);
      }
      LotList.push(Lot);
    }
    return LotList;
  };

  /**
   * Collect the lots of a sale, following the pagination until no enabled
   * "Page suivante" button is left.
   *
   * @param {object} page - Page currently showing the sale's lot list.
   * @returns {Promise<Array<object>>} Lot summaries from every visited page.
   */
  getLotList = async (page) => {
    const LotListXPath = '//div[contains(@class, "sale-item-wrapper")]';
    // search for the Button Next (only if enabled)
    const NextPageButtonXPath = "//button[contains(@aria-label, 'Page suivante') and not(contains(@class, 'v-pagination__navigation--disabled'))]";

    let LotList = [];
    let hasNextPage = false;
    do {
      // extract Lot List
      const Elements = await page.$x(LotListXPath);
      if (Elements.length > 0) {
        LotList = LotList.concat(await this._getLotInfoList(page, Elements));
      }

      const NextPageButton = await page.$x(NextPageButtonXPath);
      hasNextPage = NextPageButton.length > 0;
      if (hasNextPage) {
        await NextPageButton[0].evaluate(b => b.click());
        // fixed pause to let the next page of lots render
        await page.waitForTimeout(1000);
        console.log('Next Page');
      }
    } while (hasNextPage);
    return LotList;
  };

  // ## Sale

  /**
   * Read the sale title.
   *
   * @param {object} page - Page currently showing a sale.
   * @returns {Promise<string>} Whatever getTextContent yields for the title XPaths.
   */
  getSaleTitle = async (page) => {
    const SaleTitleXPath = [
      '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div[1]/div[2]/h1/div/div/div/div/div',
      '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div/div/h1/div/div/div/div/div',
      '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div/div/div/h1/div/div/div/div/div',
      '/html/body/div[1]/div/div/div[1]/main/div/div/div/div/div/div[1]/div/div/div/div/h1/div/div/div/div/div'
    ];
    return await this.getTextContent(SaleTitleXPath, page, 'SaleTitleXPath');
  };

  /**
   * Determine the sale date in the Europe/Paris time zone.
   *
   * When the live streaming element is present the sale is considered to be
   * running and the current time is returned. Otherwise the date text is
   * parsed as "<day> <french month> <year> <word> <HH>h<MM>", i.e. the time
   * is the fifth whitespace-separated token; "<HH>h" means minute 0.
   *
   * @param {object} page - Page currently showing a sale.
   * @returns {Promise<string>} Result of moment's format() for that instant.
   * @throws {Error} When the date text has fewer than five tokens.
   */
  getSaleDate = async (page) => {
    // Test if Live Sale
    let BoolLive = false;
    try {
      const VideoXPath = [
        '//*[@id="streaming-subscriber"]',
      ];
      const VideoExists = await this.ElementExists(VideoXPath, page, 'VideoXPath');
      console.log('VideoExists : '+VideoExists);
      BoolLive = VideoExists;
    } catch (error) {
      // Best effort: a failed probe means "not live", but leave a trace.
      console.error('VideoXPath check failed', error);
    }

    let SaleDate;
    if (BoolLive) {
      // Live Sale
      SaleDate = moment.tz('Europe/Paris').format();
    } else {
      // future sale
      await page.waitForTimeout(400);
      const SaleDateXPath = [
        '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div[1]/div[2]/h2/div[2]/div/div[1]/span',
      ];
      let SaleDateString = await this.getTextContent(SaleDateXPath, page, 'SaleDateXPath');
      SaleDateString = SaleDateString.trim();
      console.log('SaleDateString : '+SaleDateString);
      const months = {
        'janvier': '01',
        'février': '02',
        'mars': '03',
        'avril': '04',
        'mai': '05',
        'juin': '06',
        'juillet': '07',
        'août': '08',
        'septembre': '09',
        'octobre': '10',
        'novembre': '11',
        'décembre': '12'
      };

      const SaleDateArr = SaleDateString.split(/\s+/);
      if (SaleDateArr.length < 5) {
        throw new Error('Unexpected sale date format : "' + SaleDateString + '"');
      }
      const timeParts = SaleDateArr[4].split('h');
      const day = parseInt(SaleDateArr[0], 10);
      // month names are matched case-insensitively
      const month = parseInt(months[SaleDateArr[1].toLowerCase()], 10);
      const year = parseInt(SaleDateArr[2], 10);
      const hour = parseInt(timeParts[0], 10);
      // "14h" carries no minutes: treat as the full hour instead of NaN
      const minuteText = timeParts[1] === undefined ? '' : timeParts[1];
      const minute = minuteText === '' ? 0 : parseInt(minuteText, 10);

      // moment months are 0-based
      SaleDate = moment.tz([year, month - 1, day, hour, minute], 'Europe/Paris').format();
    }
    console.log('SaleDate : '+SaleDate);
    return SaleDate;
  };

  /**
   * Read the sale location.
   *
   * @param {object} page - Page currently showing a sale.
   * @returns {Promise<string>} Trimmed location text.
   */
  getSaleLocation = async (page) => {
    const SaleLocationXPath = [
      '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div[1]/div[2]/h2/div[2]/div/div[2]/span',
      '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div/div/h2/div[2]'
    ];
    const SaleLocation = await this.getTextContent(SaleLocationXPath, page, 'SaleLocationXPath');
    return SaleLocation.trim();
  };

  /**
   * Read the name of the auction house running the sale.
   *
   * @param {object} page - Page currently showing a sale.
   * @returns {Promise<string>} Name with newlines removed and whitespace runs collapsed.
   */
  getSaleHouseName = async (page) => {
    const SaleHouseNameXPath = [
      '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[2]/div[2]/div/div/div/div/div[2]/div/div/div[2]/div[2]/div[2]/a',
      '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[2]/div[3]/div/div/div/div/div[2]/div/div/div[2]/div[2]/div[2]/a',
      '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[2]/div[2]/div/div/div/div/div[2]/div/div/div/div[2]/div[2]/a',
      '/html/body/div[1]/div/div/div[1]/main/div/div/div/div/div/div[2]/div[3]/div/div/div/div/div[2]/div/div/div/div[2]/div[2]/a'
    ];
    const SaleHouseName = await this.getTextContent(SaleHouseNameXPath, page, 'SaleHouseNameXPath');
    return SaleHouseName.replace(/[\n]/g, '').replace(/\s+/g, ' ').trim();
  };
}
// CommonJS export: the module's sole export is the InterencheresData class.
module.exports = InterencheresData