Skip to content

Commit 8367e1c

Browse files
authored
Serp linkedin finder enrichment source (#2664)
1 parent 7292ede commit 8367e1c

File tree

15 files changed

+330
-76
lines changed

15 files changed

+330
-76
lines changed

services/apps/premium/members_enrichment_worker/src/activities.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import {
88
touchMemberEnrichmentCacheUpdatedAt,
99
updateMemberEnrichmentCache,
1010
} from './activities/enrichment'
11-
import { getMembers } from './activities/getMembers'
11+
import { getEnrichableMembers } from './activities/getMembers'
1212
import { refreshToken } from './activities/lf-auth0/authenticateLFAuth0'
1313
import {
1414
getIdentitiesExistInOtherMembers,
@@ -29,7 +29,7 @@ import {
2929
} from './activities/syncEnrichedData'
3030

3131
export {
32-
getMembers,
32+
getEnrichableMembers,
3333
getEnrichmentData,
3434
normalizeEnrichmentData,
3535
findMemberEnrichmentCache,

services/apps/premium/members_enrichment_worker/src/activities/getMembers.ts

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,25 @@
11
import { fetchMembersForEnrichment } from '@crowd/data-access-layer/src/old/apps/premium/members_enrichment_worker'
2-
import { IMember, IMemberEnrichmentSourceQueryInput, MemberEnrichmentSource } from '@crowd/types'
2+
import {
3+
IEnrichableMember,
4+
IMemberEnrichmentSourceQueryInput,
5+
MemberEnrichmentSource,
6+
} from '@crowd/types'
37

48
import { EnrichmentSourceServiceFactory } from '../factory'
59
import { svc } from '../main'
610

7-
export async function getMembers(
11+
export async function getEnrichableMembers(
812
limit: number,
913
sources: MemberEnrichmentSource[],
1014
afterId: string,
11-
): Promise<IMember[]> {
12-
let rows: IMember[] = []
15+
): Promise<IEnrichableMember[]> {
16+
let rows: IEnrichableMember[] = []
1317
const sourceInputs: IMemberEnrichmentSourceQueryInput[] = sources.map((s) => {
1418
const srv = EnrichmentSourceServiceFactory.getEnrichmentSourceService(s, svc.log)
1519
return {
1620
source: s,
1721
cacheObsoleteAfterSeconds: srv.cacheObsoleteAfterSeconds,
18-
enrichableBy: srv.enrichableBy,
22+
enrichableBySql: srv.enrichableBySql,
1923
}
2024
})
2125
const db = svc.postgres.reader

services/apps/premium/members_enrichment_worker/src/factory.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { MemberEnrichmentSource } from '@crowd/types'
44

55
import EnrichmentServiceClearbit from './sources/clearbit/service'
66
import EnrichmentServiceProgAI from './sources/progai/service'
7+
import EnrichmentServiceSerpApi from './sources/serp/service'
78
import { IEnrichmentService } from './types'
89
import { ALSO_USE_EMAIL_IDENTITIES_FOR_ENRICHMENT, ENRICH_EMAIL_IDENTITIES } from './utils/config'
910

@@ -21,6 +22,8 @@ export class EnrichmentSourceServiceFactory {
2122
)
2223
case MemberEnrichmentSource.CLEARBIT:
2324
return new EnrichmentServiceClearbit(log)
25+
case MemberEnrichmentSource.SERP:
26+
return new EnrichmentServiceSerpApi(log)
2427
default:
2528
throw new Error(`Enrichment service for ${source} is not found!`)
2629
}

services/apps/premium/members_enrichment_worker/src/main.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ const config: Config = {
1010
'CROWD_ENRICHMENT_PROGAI_API_KEY',
1111
'CROWD_ENRICHMENT_CLEARBIT_URL',
1212
'CROWD_ENRICHMENT_CLEARBIT_API_KEY',
13+
'CROWD_ENRICHMENT_SERP_API_URL',
14+
'CROWD_ENRICHMENT_SERP_API_KEY',
1315
],
1416
producer: {
1517
enabled: false,

services/apps/premium/members_enrichment_worker/src/sources/clearbit/service.ts

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ import axios from 'axios'
22

33
import { Logger, LoggerBase } from '@crowd/logging'
44
import {
5-
IMemberEnrichmentSourceEnrichableBy,
65
MemberAttributeName,
76
MemberEnrichmentSource,
87
MemberIdentityType,
@@ -28,11 +27,7 @@ import {
2827
export default class EnrichmentServiceClearbit extends LoggerBase implements IEnrichmentService {
2928
public source: MemberEnrichmentSource = MemberEnrichmentSource.CLEARBIT
3029
public platform = `enrichment-${this.source}`
31-
public enrichableBy: IMemberEnrichmentSourceEnrichableBy[] = [
32-
{
33-
type: MemberIdentityType.EMAIL,
34-
},
35-
]
30+
public enrichableBySql = `mi.type = 'email' and mi.verified`
3631

3732
// bust cache after 120 days
3833
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 120
@@ -60,7 +55,7 @@ export default class EnrichmentServiceClearbit extends LoggerBase implements IEn
6055
}
6156

6257
isEnrichableBySource(input: IEnrichmentSourceInput): boolean {
63-
return !!input.email?.value
58+
return !!input.email?.value && input.email?.verified
6459
}
6560

6661
async getData(input: IEnrichmentSourceInput): Promise<IMemberEnrichmentDataClearbit | null> {

services/apps/premium/members_enrichment_worker/src/sources/progai/service.ts

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ import lodash from 'lodash'
33

44
import { Logger, LoggerBase } from '@crowd/logging'
55
import {
6-
IMemberEnrichmentSourceEnrichableBy,
76
MemberAttributeName,
87
MemberEnrichmentSource,
98
MemberIdentityType,
@@ -33,15 +32,8 @@ import {
3332
export default class EnrichmentServiceProgAI extends LoggerBase implements IEnrichmentService {
3433
public source: MemberEnrichmentSource = MemberEnrichmentSource.PROGAI
3534
public platform = `enrichment-${this.source}`
36-
public enrichableBy: IMemberEnrichmentSourceEnrichableBy[] = [
37-
{
38-
type: MemberIdentityType.USERNAME,
39-
platform: PlatformType.GITHUB,
40-
},
41-
{
42-
type: MemberIdentityType.EMAIL,
43-
},
44-
]
35+
36+
enrichableBySql = `mi.verified and ((mi.type = 'username' AND mi.platform = 'github') OR (mi.type = 'email'))`
4537

4638
// bust cache after 90 days
4739
public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 90
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import axios from 'axios'
2+
3+
import { Logger, LoggerBase } from '@crowd/logging'
4+
import { MemberEnrichmentSource, MemberIdentityType, PlatformType } from '@crowd/types'
5+
6+
import {
7+
IEnrichmentService,
8+
IEnrichmentSourceInput,
9+
IMemberEnrichmentDataNormalized,
10+
} from '../../types'
11+
12+
import { IMemberEnrichmentDataSerp, IMemberEnrichmentSerpApiResponse } from './types'
13+
14+
/**
 * Enrichment source that tries to discover a member's LinkedIn profile by
 * running a Google search (restricted to `linkedin.com/in`) through a SERP API.
 *
 * The search combines the member's display name, location and one extra
 * identifier (website, GitHub identity value or email identity value).
 * Only the first organic result is ever used.
 */
export default class EnrichmentServiceSerpApi extends LoggerBase implements IEnrichmentService {
  public source: MemberEnrichmentSource = MemberEnrichmentSource.SERP
  public platform = `enrichment-${this.source}`

  // Must be declared before `enrichableBySql`, which interpolates it.
  public enrichMembersWithActivityMoreThan = 10

  // SQL predicate selecting members this source can enrich: enough activity,
  // a display name containing a space, a non-empty location, and at least one
  // of website / verified GitHub username / verified email.
  public enrichableBySql = `
  ("activitySummary".total_count > ${this.enrichMembersWithActivityMoreThan}) AND
  (members."displayName" like '% %') AND
  (members.attributes->'location'->>'default' is not null and members.attributes->'location'->>'default' <> '') AND
  ((members.attributes->'websiteUrl'->>'default' is not null and members.attributes->'websiteUrl'->>'default' <> '') OR
  (mi.verified AND mi.type = 'username' and mi.platform = 'github') OR
  (mi.verified AND mi.type = 'email')
  )`

  // bust cache after 120 days
  public cacheObsoleteAfterSeconds = 60 * 60 * 24 * 120

  constructor(public readonly log: Logger) {
    super(log)
  }

  /**
   * In-memory counterpart of the identity/attribute part of `enrichableBySql`.
   *
   * @param input - member data collected for enrichment
   * @returns `true` when the display name has at least two space-separated
   *   parts, a location is present, and there is a website, a verified GitHub
   *   identity or a verified email. Always a strict boolean.
   */
  isEnrichableBySource(input: IEnrichmentSourceInput): boolean {
    const displayNameSplit = input.displayName?.split(' ')
    const hasMultiPartName = (displayNameSplit?.length ?? 0) > 1
    const hasIdentifier =
      !!input.email?.verified || !!input.github?.verified || !!input.website

    return hasMultiPartName && !!input.location && hasIdentifier
  }

  /**
   * Queries the SERP API with each available identifier in turn
   * (website, then GitHub value, then email value) and stops at the first hit.
   *
   * @param input - member data collected for enrichment
   * @returns the discovered LinkedIn URL, or `null` when nothing usable was
   *   found or the member lacks a display name / location
   * @throws whatever `axios` throws for a failed HTTP request
   */
  async getData(input: IEnrichmentSourceInput): Promise<IMemberEnrichmentDataSerp | null> {
    if (!input.displayName || !input.location) {
      return null
    }

    // Order matters: it is the priority in which identifiers are tried.
    const identifiers = [input.website, input.github?.value, input.email?.value]

    for (const identifier of identifiers) {
      if (!identifier) {
        continue
      }

      const enriched = await this.querySerpApi(input.displayName, input.location, identifier)
      if (enriched) {
        return enriched
      }
    }

    return null
  }

  /**
   * Performs a single Google search through the SERP API.
   *
   * A result is accepted only when the API reports at least one total result,
   * the first organic result has a link, and the search engine did not apply
   * a spelling correction to the query.
   *
   * @returns the first organic link wrapped as enrichment data, otherwise `null`
   */
  private async querySerpApi(
    displayName: string,
    location: string,
    identifier: string,
  ): Promise<IMemberEnrichmentDataSerp | null> {
    const config = {
      method: 'get',
      url: process.env['CROWD_ENRICHMENT_SERP_API_URL'],
      params: {
        api_key: process.env['CROWD_ENRICHMENT_SERP_API_KEY'],
        q: `"${displayName}" ${location} "${identifier}" site:linkedin.com/in`,
        num: 3,
        engine: 'google',
      },
    }

    const response: IMemberEnrichmentSerpApiResponse = (await axios(config)).data

    // Guard every level: the payload may omit `search_information` or
    // `organic_results`, and that must read as "no match" rather than throw.
    const searchInformation = response?.search_information
    const firstLink = response?.organic_results?.[0]?.link

    if (!searchInformation || !(searchInformation.total_results > 0) || !firstLink) {
      return null
    }

    // A spelling fix means the results are for a different query than the one sent.
    if (searchInformation.spelling_fix || searchInformation.spelling_fix_type) {
      return null
    }

    return { linkedinUrl: firstLink }
  }

  /**
   * Converts raw SERP enrichment data into the normalized shape: a single,
   * unverified LinkedIn identity whose value is the normalized profile URL.
   */
  normalize(data: IMemberEnrichmentDataSerp): IMemberEnrichmentDataNormalized {
    const normalized: IMemberEnrichmentDataNormalized = {
      identities: [
        {
          platform: PlatformType.LINKEDIN,
          type: MemberIdentityType.USERNAME,
          verified: false,
          value: this.normalizeLinkedUrl(data.linkedinUrl),
        },
      ],
    }
    return normalized
  }

  /**
   * Canonicalizes a LinkedIn URL: collapses any `*.linkedin.com` subdomain to
   * `linkedin.com`, drops the query string and a single trailing slash.
   * URLs on other hosts are returned unchanged.
   *
   * @throws when `url` cannot be parsed (the error is logged first)
   */
  private normalizeLinkedUrl(url: string): string {
    try {
      const parsedUrl = new URL(url)
      const host = parsedUrl.hostname

      // Match the exact domain or a true subdomain only; a bare `endsWith('linkedin.com')`
      // would also accept look-alike hosts such as `notlinkedin.com`.
      if (host === 'linkedin.com' || host.endsWith('.linkedin.com')) {
        parsedUrl.hostname = 'linkedin.com'
        parsedUrl.search = ''

        let path = parsedUrl.pathname
        if (path.endsWith('/')) {
          path = path.slice(0, -1)
        }

        return parsedUrl.origin + path
      }

      return url
    } catch (error) {
      this.log.error(`Error while normalizing linkedin url: ${url}`, error)
      throw error
    }
  }
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/* eslint-disable @typescript-eslint/no-explicit-any */
2+
import axios from 'axios'
3+
4+
// import { writeFileSync } from 'fs'
5+
// import { Parser } from 'json2csv'
6+
import { timeout } from '@crowd/common'
7+
8+
/**
 * Ad-hoc manual script: for each member in the (hand-filled) `members` list,
 * runs a Google search via the SERP API restricted to `linkedin.com/in` and
 * stores the first organic link on the member as `linkedinFromSerp`.
 *
 * Waits one second between requests.
 */
const testSerpApi = async () => {
  // Fill in by hand with objects carrying `displayName`, `location` and `website`.
  const members = [] as any[]

  // Fall back to the variable names the worker itself is configured with, so the
  // script works with the same environment as the service.
  const url = process.env['CROWD_ENRICHMENT_SERP_API_URL'] || `https://serpapi.com/search.json`
  const apiKey = process.env['CROWD_SERP_API_KEY'] || process.env['CROWD_ENRICHMENT_SERP_API_KEY']

  for (const mem of members) {
    const config = {
      method: 'get',
      url,
      params: {
        api_key: apiKey,
        q: `"${mem.displayName}" ${mem.location} "${mem.website}" site:linkedin.com/in`,
        num: 3,
        engine: 'google',
      },
    }

    const response = (await axios(config)).data

    // Treat missing sections of the payload as "no result" instead of throwing.
    const searchInformation = response?.search_information
    const firstLink = response?.organic_results?.[0]?.link

    const found =
      searchInformation?.total_results > 0 &&
      !!firstLink &&
      !searchInformation.spelling_fix &&
      !searchInformation.spelling_fix_type

    if (found) {
      console.log(`Found LinkedIn for ${mem.displayName}: ${firstLink}`)
      console.log(searchInformation)
      mem.linkedinFromSerp = firstLink
    } else {
      console.log(`No LinkedIn found for ${mem.displayName}`)
    }

    await timeout(1000)
  }

  // Optional CSV export, disabled together with the `fs` / `json2csv` imports.
  // When re-enabling, wrap it in try/catch to report write failures.
  //
  // const fields = [
  //   'id',
  //   'displayName',
  //   'location',
  //   'profileUrl',
  //   'website',
  //   'linkedinFromClearbit',
  //   'linkedinFromProgai',
  //   'linkedinFromSerp',
  // ]
  // const json2csvParser = new Parser({ fields })
  // const csv = json2csvParser.parse(members)
  // writeFileSync('output.csv', csv)
  // console.log('CSV file has been successfully written.')
}
65+
66+
// Script entry point: run the SERP test once the module has finished loading,
// then terminate the process with a success exit code.
const runAndExit = async (): Promise<void> => {
  await testSerpApi()
  process.exit(0)
}

setImmediate(runAndExit)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/**
 * Raw enrichment payload produced by the SERP source.
 */
export interface IMemberEnrichmentDataSerp {
  /** LinkedIn profile URL taken from the search result. */
  linkedinUrl: string
}
4+
5+
/**
 * Subset of the SERP API search response body that the enrichment code reads.
 */
export interface IMemberEnrichmentSerpApiResponse {
  /** Organic (non-ad) search results. */
  organic_results: Array<IMemberEnrichmentSerpApiResponseOrganicResult>
  /** Metadata about the executed search. */
  search_information: IMemberEnrichmentSerpApiResponseSearchInformation
}
9+
10+
/**
 * `search_information` section of a SERP API search response.
 */
export interface IMemberEnrichmentSerpApiResponseSearchInformation {
  query_displayed: string
  /** Total number of results reported for the query. */
  total_results: number
  time_taken_displayed: number
  organic_results_state: string
  /** Optional; presumably set when the engine corrected the query's spelling — confirm against API docs. */
  spelling_fix?: string
  /** Optional; presumably describes the kind of spelling correction applied — confirm against API docs. */
  spelling_fix_type?: string
}
18+
19+
/**
 * One entry of `organic_results` in a SERP API search response.
 */
export interface IMemberEnrichmentSerpApiResponseOrganicResult {
  position: number
  title: string
  /** URL of the result page. */
  link: string
  redirect_link: string
  displayed_link: string
  favicon: string
  snippet: string
  snippet_highlighted_words: Array<string>
  sitelinks: {
    inline: Array<IMemberEnrichmentSerpApiResponseOrganicResultSitelinkInline>
  }
  source: string
}
33+
34+
/**
 * Inline sitelink attached to an organic search result.
 */
export interface IMemberEnrichmentSerpApiResponseOrganicResultSitelinkInline {
  /** Sitelink title. */
  title: string
  /** Sitelink target URL. */
  link: string
}

0 commit comments

Comments
 (0)